diff --git a/docs/releases.rst b/docs/releases.rst index 622f01e0..ae28fbbe 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -28,6 +28,9 @@ New Features - Load scalar variables by default. (:pull:`205`) By `Gustavo Hidalgo `_. +- Support empty files (:pull:`260`) + By `Justus Magin `_. + Breaking changes ~~~~~~~~~~~~~~~~ @@ -35,7 +38,7 @@ Breaking changes By `Gustavo Hidalgo `_. - VirtualiZarr's `ZArray`, `ChunkEntry`, and `Codec` no longer subclass `pydantic.BaseModel` (:pull:`210`) -- `ZArray`'s `__init__` signature has changed to match `zarr.Array`'s (:pull:`xxx`) +- `ZArray`'s `__init__` signature has changed to match `zarr.Array`'s (:pull:`210`) Deprecations ~~~~~~~~~~~~ @@ -55,7 +58,7 @@ Bug fixes Documentation ~~~~~~~~~~~~~ -- Adds virtualizarr + coiled serverless example notebook (:pull`223`) +- Adds virtualizarr + coiled serverless example notebook (:pull:`223`) By `Raphael Hagen `_. diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 88ac9a91..1933844a 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -89,7 +89,7 @@ class ChunkManifest: _offsets: np.ndarray[Any, np.dtype[np.uint64]] _lengths: np.ndarray[Any, np.dtype[np.uint64]] - def __init__(self, entries: dict) -> None: + def __init__(self, entries: dict, shape: tuple[int, ...] | None = None) -> None: """ Create a ChunkManifest from a dictionary mapping zarr chunk keys to byte ranges. @@ -105,13 +105,14 @@ def __init__(self, entries: dict) -> None: "0.1.1": {"path": "s3://bucket/foo.nc", "offset": 400, "length": 100}, } """ + if shape is None and not entries: + raise ValueError("need a chunk grid shape if no chunks given") # TODO do some input validation here first? validate_chunk_keys(entries.keys()) - # TODO should we actually optionally pass chunk grid shape in, - # in case there are not enough chunks to give correct idea of full shape? - shape = get_chunk_grid_shape(entries.keys()) + if shape is None: + shape = get_chunk_grid_shape(entries.keys()) # Initializing to empty implies that entries with path='' are treated as missing chunks paths = cast( # `np.empty` apparently is type hinted as if the output could have Any dtype @@ -386,6 +387,9 @@ def get_ndim_from_key(key: str) -> int: def validate_chunk_keys(chunk_keys: Iterable[ChunkKey]): + if not chunk_keys: + return + # Check if all keys have the correct form for key in chunk_keys: if not re.match(_CHUNK_KEY, key): diff --git a/virtualizarr/readers/kerchunk.py b/virtualizarr/readers/kerchunk.py index d3632b68..a8740b19 100644 --- a/virtualizarr/readers/kerchunk.py +++ b/virtualizarr/readers/kerchunk.py @@ -13,7 +13,7 @@ KerchunkStoreRefs, ) from virtualizarr.utils import _FsspecFSFromFilepath -from virtualizarr.zarr import ZArray, ZAttrs +from virtualizarr.zarr import ZArray, ZAttrs, ceildiv # TODO shouldn't this live in backend.py? Because it's not just useful for the kerchunk-specific readers... @@ -230,6 +230,13 @@ def dataset_from_kerchunk_refs( return vds +def determine_chunk_grid_shape(zarray): + return tuple( + ceildiv(length, chunksize) + for length, chunksize in zip(zarray.shape, zarray.chunks) + ) + + def variable_from_kerchunk_refs( refs: KerchunkStoreRefs, var_name: str, virtual_array_class ) -> Variable: @@ -242,6 +249,12 @@ def variable_from_kerchunk_refs( if chunk_dict: manifest = ChunkManifest._from_kerchunk_chunk_dict(chunk_dict) varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest) + elif len(zarray.shape) != 0: + # empty variables don't have physical chunks, but zarray shows that the variable + # is at least 1D + shape = determine_chunk_grid_shape(zarray) + manifest = ChunkManifest(entries={}, shape=shape) + varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest) else: # This means we encountered a scalar variable of dimension 0, # very likely that it actually has no numeric value and its only purpose diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index 9031195f..f3a9ee9f 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -203,6 +203,38 @@ def test_broadcast_any_shape(self, shape, chunks, target_shape): for len_arr, len_chunk in zip(broadcasted_marr.shape, broadcasted_chunk_shape): assert len_chunk <= len_arr + @pytest.mark.parametrize( + "shape, chunks, grid_shape, target_shape", + [ + ((1,), (1,), (1,), (3,)), + ((2,), (1,), (2,), (2,)), + ((3,), (2,), (2,), (5, 4, 3)), + ((3, 1), (2, 1), (2, 1), (2, 3, 4)), + ], + ) + def test_broadcast_empty(self, shape, chunks, grid_shape, target_shape): + zarray = ZArray( + chunks=chunks, + compressor={"id": "zlib", "level": 1}, + dtype=np.dtype("int32"), + fill_value=0.0, + filters=None, + order="C", + shape=shape, + zarr_format=2, + ) + manifest = ChunkManifest(entries={}, shape=grid_shape) + marr = ManifestArray(zarray, manifest) + + expanded = np.broadcast_to(marr, shape=target_shape) + assert expanded.shape == target_shape + assert len(expanded.chunks) == expanded.ndim + assert all( + len_chunk <= len_arr + for len_arr, len_chunk in zip(expanded.shape, expanded.chunks) + ) + assert expanded.manifest.dict() == {} + # TODO we really need some kind of fixtures to generate useful example data # The hard part is having an alternative way to get to the expected result of concatenation @@ -250,6 +282,44 @@ def test_concat(self): assert result.zarray.order == zarray.order assert result.zarray.zarr_format == zarray.zarr_format + def test_concat_empty(self): + # both manifest arrays in this example have the same zarray properties + zarray = ZArray( + chunks=(5, 1, 10), + compressor={"id": "zlib", "level": 1}, + dtype=np.dtype("int32"), + fill_value=0.0, + filters=None, + order="C", + shape=(5, 1, 20), + zarr_format=2, + ) + + chunks_dict1 = {} + manifest1 = ChunkManifest(entries=chunks_dict1, shape=(1, 1, 2)) + marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1) + + chunks_dict2 = { + "0.0.0": {"path": "foo.nc", "offset": 300, "length": 100}, + "0.0.1": {"path": "foo.nc", "offset": 400, "length": 100}, + } + manifest2 = ChunkManifest(entries=chunks_dict2) + marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2) + + result = np.concatenate([marr1, marr2], axis=1) + + assert result.shape == (5, 2, 20) + assert result.chunks == (5, 1, 10) + assert result.manifest.dict() == { + "0.1.0": {"path": "foo.nc", "offset": 300, "length": 100}, + "0.1.1": {"path": "foo.nc", "offset": 400, "length": 100}, + } + assert result.zarray.compressor == zarray.compressor + assert result.zarray.filters == zarray.filters + assert result.zarray.fill_value == zarray.fill_value + assert result.zarray.order == zarray.order + assert result.zarray.zarr_format == zarray.zarr_format + class TestStack: def test_stack(self): @@ -295,6 +365,44 @@ def test_stack(self): assert result.zarray.order == zarray.order assert result.zarray.zarr_format == zarray.zarr_format + def test_stack_empty(self): + # both manifest arrays in this example have the same zarray properties + zarray = ZArray( + chunks=(5, 10), + compressor={"id": "zlib", "level": 1}, + dtype=np.dtype("int32"), + fill_value=0.0, + filters=None, + order="C", + shape=(5, 20), + zarr_format=2, + ) + + chunks_dict1 = {} + manifest1 = ChunkManifest(entries=chunks_dict1, shape=(1, 2)) + marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1) + + chunks_dict2 = { + "0.0": {"path": "foo.nc", "offset": 300, "length": 100}, + "0.1": {"path": "foo.nc", "offset": 400, "length": 100}, + } + manifest2 = ChunkManifest(entries=chunks_dict2) + marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2) + + result = np.stack([marr1, marr2], axis=1) + + assert result.shape == (5, 2, 20) + assert result.chunks == (5, 1, 10) + assert result.manifest.dict() == { + "0.1.0": {"path": "foo.nc", "offset": 300, "length": 100}, + "0.1.1": {"path": "foo.nc", "offset": 400, "length": 100}, + } + assert result.zarray.compressor == zarray.compressor + assert result.zarray.filters == zarray.filters + assert result.zarray.fill_value == zarray.fill_value + assert result.zarray.order == zarray.order + assert result.zarray.zarr_format == zarray.zarr_format + def test_refuse_combine(): # TODO test refusing to concatenate arrays that have conflicting shapes / chunk sizes diff --git a/virtualizarr/tests/test_manifests/test_manifest.py b/virtualizarr/tests/test_manifests/test_manifest.py index fb099413..3e084e64 100644 --- a/virtualizarr/tests/test_manifests/test_manifest.py +++ b/virtualizarr/tests/test_manifests/test_manifest.py @@ -20,6 +20,14 @@ def test_create_manifest(self): manifest = ChunkManifest(entries=chunks) assert manifest.dict() == chunks + chunks = {} + manifest = ChunkManifest(entries=chunks, shape=(2, 2)) + assert manifest.dict() == chunks + + def test_create_manifest_empty_missing_shape(self): + with pytest.raises(ValueError, match="chunk grid shape if no chunks"): + ChunkManifest(entries={}) + def test_invalid_chunk_entries(self): chunks = { "0.0.0": {"path": "s3://bucket/foo.nc"}, diff --git a/virtualizarr/tests/test_readers/test_kerchunk.py b/virtualizarr/tests/test_readers/test_kerchunk.py index 0faa1ff2..50d4b19b 100644 --- a/virtualizarr/tests/test_readers/test_kerchunk.py +++ b/virtualizarr/tests/test_readers/test_kerchunk.py @@ -1,4 +1,5 @@ import numpy as np +import ujson from virtualizarr.manifests import ManifestArray from virtualizarr.readers.kerchunk import ( @@ -45,8 +46,6 @@ def test_dataset_from_df_refs(): def test_dataset_from_df_refs_with_filters(): - import ujson - filters = [{"elementsize": 4, "id": "shuffle"}, {"id": "zlib", "level": 4}] zarray = { "chunks": [2, 3], @@ -62,3 +61,24 @@ def test_dataset_from_df_refs_with_filters(): ds = dataset_from_kerchunk_refs(ds_refs) da = ds["a"] assert da.data.zarray.filters == filters + + +def test_dataset_from_kerchunk_refs_empty_chunk_manifest(): + zarray = { + "chunks": [50, 100], + "compressor": None, + "dtype": "