diff --git a/docs/release-notes.rst b/docs/release-notes.rst index 3fdf8c847a..2276889cf6 100644 --- a/docs/release-notes.rst +++ b/docs/release-notes.rst @@ -7,6 +7,8 @@ Unreleased Bug fixes ~~~~~~~~~ +* Backwards compatibility for Zarr format 2 structured arrays (:issue:`2134`) + Features ~~~~~~~~ diff --git a/docs/user-guide/config.rst b/docs/user-guide/config.rst index 871291b72b..3662f75dff 100644 --- a/docs/user-guide/config.rst +++ b/docs/user-guide/config.rst @@ -53,6 +53,7 @@ This is the current default configuration:: 'level': 0}}, 'v2_default_filters': {'bytes': [{'id': 'vlen-bytes'}], 'numeric': None, + 'raw': None, 'string': [{'id': 'vlen-utf8'}]}, 'v3_default_compressors': {'bytes': [{'configuration': {'checksum': False, 'level': 0}, diff --git a/src/zarr/core/buffer/core.py b/src/zarr/core/buffer/core.py index 85a7351fc7..ccab103e0f 100644 --- a/src/zarr/core/buffer/core.py +++ b/src/zarr/core/buffer/core.py @@ -470,7 +470,9 @@ def all_equal(self, other: Any, equal_nan: bool = True) -> bool: # every single time we have to write data? 
_data, other = np.broadcast_arrays(self._data, other) return np.array_equal( - self._data, other, equal_nan=equal_nan if self._data.dtype.kind not in "USTO" else False + self._data, + other, + equal_nan=equal_nan if self._data.dtype.kind not in "USTOV" else False, ) def fill(self, value: Any) -> None: diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 7920d220a4..051e8c68e1 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -75,6 +75,7 @@ def reset(self) -> None: "numeric": None, "string": [{"id": "vlen-utf8"}], "bytes": [{"id": "vlen-bytes"}], + "raw": None, }, "v3_default_filters": {"numeric": [], "string": [], "bytes": []}, "v3_default_serializer": { diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 29cf15a119..192db5b203 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -193,7 +193,14 @@ def to_dict(self) -> dict[str, JSON]: zarray_dict["fill_value"] = fill_value _ = zarray_dict.pop("dtype") - zarray_dict["dtype"] = self.dtype.str + dtype_json: JSON + # zarr v2: a plain void dtype ('|VXX') stays a string; named fields are stored as descr + dtype_descr = self.dtype.descr + if self.dtype.kind == "V" and len(dtype_descr) != 0 and dtype_descr[0][0] != "": + dtype_json = tuple(dtype_descr) + else: + dtype_json = self.dtype.str + zarray_dict["dtype"] = dtype_json return zarray_dict @@ -220,6 +227,8 @@ def update_attributes(self, attributes: dict[str, JSON]) -> Self: def parse_dtype(data: npt.DTypeLike) -> np.dtype[Any]: + if isinstance(data, list): # JSON turns field tuples into lists; numpy needs tuples + data = [tuple(d) for d in data] return np.dtype(data) @@ -376,8 +385,10 @@ def _default_filters( dtype_key = "numeric" elif dtype.kind in "U": dtype_key = "string" - elif dtype.kind in "OSV": + elif dtype.kind in "OS": dtype_key = "bytes" + elif dtype.kind == "V": + dtype_key = "raw" else: raise ValueError(f"Unsupported dtype kind {dtype.kind}") diff --git a/tests/test_config.py 
b/tests/test_config.py index c552ace840..1a2453d646 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -61,6 +61,7 @@ def test_config_defaults_set() -> None: "numeric": None, "string": [{"id": "vlen-utf8"}], "bytes": [{"id": "vlen-bytes"}], + "raw": None, }, "v3_default_filters": {"numeric": [], "string": [], "bytes": []}, "v3_default_serializer": { diff --git a/tests/test_v2.py b/tests/test_v2.py index b657af9c47..4c689c8e64 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -84,8 +84,15 @@ def test_codec_pipeline() -> None: np.testing.assert_array_equal(result, expected) -@pytest.mark.parametrize("dtype", ["|S", "|V"]) -async def test_v2_encode_decode(dtype): +@pytest.mark.parametrize( + ("dtype", "expected_dtype", "fill_value", "fill_value_encoding"), + [ + ("|S", "|S0", b"X", "WA=="), + ("|V", "|V0", b"X", "WA=="), + ("|V10", "|V10", b"X", "WAAAAAAAAAAAAA=="), + ], +) +async def test_v2_encode_decode(dtype, expected_dtype, fill_value, fill_value_encoding) -> None: with config.set( { "array.v2_default_filters.bytes": [{"id": "vlen-bytes"}], @@ -95,7 +102,7 @@ async def test_v2_encode_decode(dtype): store = zarr.storage.MemoryStore() g = zarr.group(store=store, zarr_format=2) g.create_array( - name="foo", shape=(3,), chunks=(3,), dtype=dtype, fill_value=b"X", compressor=None + name="foo", shape=(3,), chunks=(3,), dtype=dtype, fill_value=fill_value, compressor=None ) result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype()) @@ -105,9 +112,9 @@ async def test_v2_encode_decode(dtype): expected = { "chunks": [3], "compressor": None, - "dtype": f"{dtype}0", - "fill_value": "WA==", - "filters": [{"id": "vlen-bytes"}], + "dtype": expected_dtype, + "fill_value": fill_value_encoding, + "filters": [{"id": "vlen-bytes"}] if dtype == "|S" else None, "order": "C", "shape": [3], "zarr_format": 2, @@ -284,3 +291,25 @@ def test_default_filters_and_compressor(dtype_expected: Any) -> None: assert arr.metadata.compressor.codec_id == 
expected_compressor if expected_filter is not None: assert arr.metadata.filters[0].codec_id == expected_filter + + +@pytest.mark.parametrize("fill_value", [None, (b"", 0, 0.0)], ids=["no_fill", "fill"]) +def test_structured_dtype_roundtrip(fill_value, tmp_path) -> None: + a = np.array( + [(b"aaa", 1, 4.2), (b"bbb", 2, 8.4), (b"ccc", 3, 12.6)], + dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")], + ) + array_path = tmp_path / "data.zarr" + za = zarr.create( + shape=(3,), + store=array_path, + chunks=(2,), + fill_value=fill_value, + zarr_format=2, + dtype=a.dtype, + ) + if fill_value is not None: + assert (np.array([fill_value] * a.shape[0], dtype=a.dtype) == za[:]).all() + za[...] = a + za = zarr.open_array(store=array_path) + assert (a == za[:]).all()