diff --git a/Cargo.toml b/Cargo.toml index b305263..66b9e1c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,6 +22,7 @@ pyo3-stub-gen = "0.6.2" opendal = { version = "0.51.0", features = ["services-http"] } tokio = { version = "1.41.1", features = ["rt-multi-thread"] } zarrs_opendal = "0.5.0" +zarrs_metadata = "0.3.3" # require recent zarr-python compatibility fixes (remove with zarrs 0.20) [profile.release] lto = true diff --git a/README.md b/README.md index d9860db..c94befe 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ You can then use your `zarr` as normal (with some caveats)! ## API -We export a `ZarrsCodecPipeline` class so that `zarr-python` can use the class but it is not meant to be instantiated and we do not guarantee the stability of its API beyond what is required so that `zarr-python` can use it. Therefore, it is not documented here. We also export two errors, `DiscontiguousArrayError` and `CollapsedDimensionError` that can be thrown in the process of converting to indexers that `zarrs` can understand (see below for more details). +We export a `ZarrsCodecPipeline` class so that `zarr-python` can use the class but it is not meant to be instantiated and we do not guarantee the stability of its API beyond what is required so that `zarr-python` can use it. Therefore, it is not documented here. At the moment, we only support a subset of the `zarr-python` stores: @@ -86,7 +86,7 @@ Chunk concurrency is typically favored because: ## Supported Indexing Methods -We **do not** officially support the following indexing methods. Some of these methods may error out, others may not: +The following methods will trigger use with the old zarr-python pipeline: 1. Any `oindex` or `vindex` integer `np.ndarray` indexing with dimensionality >=3 i.e., @@ -116,6 +116,9 @@ We **do not** officially support the following indexing methods. Some of these arr[0:10, ..., 0:5] ``` -Otherwise, we believe that we support your indexing case: slices, ints, and all integer `np.ndarray` indices in 2D for reading, contiguous integer `np.ndarray` indices along one axis for writing etc. Please file an issue if you believe we have more holes in our coverage than we are aware of or you wish to contribute! For example, we have an [issue in zarrs for integer-array indexing](https://github.com/LDeakin/zarrs/issues/52) that would unblock a lot of these issues! -That being said, using non-contiguous integer `np.ndarray` indexing for reads may not be as fast as expected given the performance of other supported methods. Until `zarrs` supports integer indexing, only fetching chunks is done in `rust` while indexing then occurs in `python`. +Furthermore, using anything except contiguous (i.e., slices or consecutive integer) `np.ndarray` for numeric data will fall back to the default `zarr-python` implementation. + +Please file an issue if you believe we have more holes in our coverage than we are aware of or you wish to contribute! For example, we have an [issue in zarrs for integer-array indexing](https://github.com/LDeakin/zarrs/issues/52) that would unblock a lot the use of the rust pipeline for that use-case (very useful for mini-batch training perhaps!). + +Further, any codecs not supported by `zarrs` will also automatically fall back to the python implementation. diff --git a/docs/api.md b/docs/api.md deleted file mode 100644 index f02ad87..0000000 --- a/docs/api.md +++ /dev/null @@ -1,13 +0,0 @@ -# Exceptions - -```{eval-rst} -.. module:: zarrs -``` - -```{eval-rst} -.. autosummary:: - :toctree: generated/ - - DiscontiguousArrayError - CollapsedDimensionError -``` diff --git a/docs/index.md b/docs/index.md index 5f590ef..aef0562 100644 --- a/docs/index.md +++ b/docs/index.md @@ -5,6 +5,5 @@ :hidden: true :maxdepth: 1 -api contributing ``` diff --git a/pyproject.toml b/pyproject.toml index c2f9885..509e930 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ dependencies = [ 'donfig', 'pytest', 'universal_pathlib>=0.2.0', - 'zarr>=3.0.0', + "zarr", ] [project.optional-dependencies] diff --git a/python/zarrs/_internal.pyi b/python/zarrs/_internal.pyi index f9c1bf5..2086b99 100644 --- a/python/zarrs/_internal.pyi +++ b/python/zarrs/_internal.pyi @@ -4,7 +4,6 @@ import typing from enum import Enum, auto -import numpy import numpy.typing class Basic: @@ -27,9 +26,6 @@ class CodecPipelineImpl: chunk_descriptions: typing.Sequence[WithSubset], value: numpy.typing.NDArray[typing.Any], ) -> None: ... - def retrieve_chunks( - self, chunk_descriptions: typing.Sequence[Basic] - ) -> list[numpy.typing.NDArray[numpy.uint8]]: ... def store_chunks_with_indices( self, chunk_descriptions: typing.Sequence[WithSubset], diff --git a/python/zarrs/pipeline.py b/python/zarrs/pipeline.py index a211618..0f089e5 100644 --- a/python/zarrs/pipeline.py +++ b/python/zarrs/pipeline.py @@ -2,15 +2,17 @@ import asyncio import json +import re from dataclasses import dataclass from typing import TYPE_CHECKING, TypedDict import numpy as np from zarr.abc.codec import Codec, CodecPipeline +from zarr.core import BatchedCodecPipeline from zarr.core.config import config if TYPE_CHECKING: - from collections.abc import Iterable, Iterator + from collections.abc import Generator, Iterable, Iterator from typing import Any, Self from zarr.abc.store import ByteGetter, ByteSetter @@ -20,28 +22,64 @@ from zarr.core.common import ChunkCoords from zarr.core.indexing import SelectorTuple -from ._internal import CodecPipelineImpl +from ._internal import CodecPipelineImpl, codec_metadata_v2_to_v3 from .utils import ( CollapsedDimensionError, DiscontiguousArrayError, - make_chunk_info_for_rust, + FillValueNoneError, make_chunk_info_for_rust_with_indices, ) -def get_codec_pipeline_impl(codec_metadata_json: str) -> CodecPipelineImpl: - return CodecPipelineImpl( - codec_metadata_json, - validate_checksums=config.get("codec_pipeline.validate_checksums", None), - store_empty_chunks=config.get("array.write_empty_chunks", None), - chunk_concurrent_minimum=config.get( - "codec_pipeline.chunk_concurrent_minimum", None - ), - chunk_concurrent_maximum=config.get( - "codec_pipeline.chunk_concurrent_maximum", None - ), - num_threads=config.get("threading.max_workers", None), - ) +class UnsupportedDataTypeError(Exception): + pass + + +class UnsupportedMetadataError(Exception): + pass + + +def get_codec_pipeline_impl(codec_metadata_json: str) -> CodecPipelineImpl | None: + try: + return CodecPipelineImpl( + codec_metadata_json, + validate_checksums=config.get("codec_pipeline.validate_checksums", None), + store_empty_chunks=config.get("array.write_empty_chunks", None), + chunk_concurrent_minimum=config.get( + "codec_pipeline.chunk_concurrent_minimum", None + ), + chunk_concurrent_maximum=config.get( + "codec_pipeline.chunk_concurrent_maximum", None + ), + num_threads=config.get("threading.max_workers", None), + ) + except TypeError as e: + if re.match(r"codec (delta|zlib) is not supported", str(e)): + return None + else: + raise e + + +def codecs_to_dict(codecs: Iterable[Codec]) -> Generator[dict[str, Any], None, None]: + for codec in codecs: + if codec.__class__.__name__ == "V2Codec": + codec_dict = codec.to_dict() + if codec_dict.get("filters", None) is not None: + filters = [ + json.dumps(filter.get_config()) + for filter in codec_dict.get("filters") + ] + else: + filters = None + if codec_dict.get("compressor", None) is not None: + compressor = json.dumps(codec_dict.get("compressor").get_config()) + else: + compressor = None + codecs_v3 = codec_metadata_v2_to_v3(filters, compressor) + for codec in codecs_v3: + yield json.loads(codec) + else: + yield codec.to_dict() class ZarrsCodecPipelineState(TypedDict): @@ -52,8 +90,9 @@ class ZarrsCodecPipelineState(TypedDict): @dataclass class ZarrsCodecPipeline(CodecPipeline): codecs: tuple[Codec, ...] - impl: CodecPipelineImpl + impl: CodecPipelineImpl | None codec_metadata_json: str + python_impl: BatchedCodecPipeline def __getstate__(self) -> ZarrsCodecPipelineState: return {"codec_metadata_json": self.codec_metadata_json, "codecs": self.codecs} @@ -62,13 +101,14 @@ def __setstate__(self, state: ZarrsCodecPipelineState): self.codecs = state["codecs"] self.codec_metadata_json = state["codec_metadata_json"] self.impl = get_codec_pipeline_impl(self.codec_metadata_json) + self.python_impl = BatchedCodecPipeline.from_codecs(self.codecs) def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: raise NotImplementedError("evolve_from_array_spec") @classmethod def from_codecs(cls, codecs: Iterable[Codec]) -> Self: - codec_metadata = [codec.to_dict() for codec in codecs] + codec_metadata = list(codecs_to_dict(codecs)) codec_metadata_json = json.dumps(codec_metadata) # TODO: upstream zarr-python has not settled on how to deal with configs yet # Should they be checked when an array is created, or when an operation is performed? @@ -78,6 +118,7 @@ def from_codecs(cls, codecs: Iterable[Codec]) -> Self: codec_metadata_json=codec_metadata_json, codecs=tuple(codecs), impl=get_codec_pipeline_impl(codec_metadata_json), + python_impl=BatchedCodecPipeline.from_codecs(codecs), ) @property @@ -120,29 +161,32 @@ async def read( drop_axes: tuple[int, ...] = (), # FIXME: unused ) -> None: # FIXME: Error if array is not in host memory - out: NDArrayLike = out.as_ndarray_like() if not out.dtype.isnative: raise RuntimeError("Non-native byte order not supported") try: + if self.impl is None: + raise UnsupportedMetadataError() + self._raise_error_on_unsupported_batch_dtype(batch_info) chunks_desc = make_chunk_info_for_rust_with_indices( batch_info, drop_axes, out.shape ) - except (DiscontiguousArrayError, CollapsedDimensionError): - chunks_desc = make_chunk_info_for_rust(batch_info) + except ( + UnsupportedMetadataError, + DiscontiguousArrayError, + CollapsedDimensionError, + UnsupportedDataTypeError, + FillValueNoneError, + ): + await self.python_impl.read(batch_info, out, drop_axes) + return None else: + out: NDArrayLike = out.as_ndarray_like() await asyncio.to_thread( self.impl.retrieve_chunks_and_apply_index, chunks_desc, out, ) return None - chunks = await asyncio.to_thread(self.impl.retrieve_chunks, chunks_desc) - for chunk, (_, spec, selection, out_selection) in zip(chunks, batch_info): - chunk_reshaped = chunk.view(spec.dtype).reshape(spec.shape) - chunk_selected = chunk_reshaped[selection] - if drop_axes: - chunk_selected = np.squeeze(chunk_selected, axis=drop_axes) - out[out_selection] = chunk_selected async def write( self, @@ -152,14 +196,46 @@ async def write( value: NDBuffer, # type: ignore drop_axes: tuple[int, ...] = (), ) -> None: - # FIXME: Error if array is not in host memory - value: NDArrayLike | np.ndarray = value.as_ndarray_like() - if not value.dtype.isnative: - value = np.ascontiguousarray(value, dtype=value.dtype.newbyteorder("=")) - elif not value.flags.c_contiguous: - value = np.ascontiguousarray(value) - chunks_desc = make_chunk_info_for_rust_with_indices( - batch_info, drop_axes, value.shape - ) - await asyncio.to_thread(self.impl.store_chunks_with_indices, chunks_desc, value) - return None + try: + if self.impl is None: + raise UnsupportedMetadataError() + self._raise_error_on_unsupported_batch_dtype(batch_info) + chunks_desc = make_chunk_info_for_rust_with_indices( + batch_info, drop_axes, value.shape + ) + except ( + UnsupportedMetadataError, + DiscontiguousArrayError, + CollapsedDimensionError, + UnsupportedDataTypeError, + FillValueNoneError, + ): + await self.python_impl.write(batch_info, value, drop_axes) + return None + else: + # FIXME: Error if array is not in host memory + value_np: NDArrayLike | np.ndarray = value.as_ndarray_like() + if not value_np.dtype.isnative: + value_np = np.ascontiguousarray( + value_np, dtype=value_np.dtype.newbyteorder("=") + ) + elif not value_np.flags.c_contiguous: + value_np = np.ascontiguousarray(value_np) + await asyncio.to_thread( + self.impl.store_chunks_with_indices, chunks_desc, value_np + ) + return None + + def _raise_error_on_unsupported_batch_dtype( + self, + batch_info: Iterable[ + tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple] + ], + ): + # https://github.com/LDeakin/zarrs/blob/0532fe983b7b42b59dbf84e50a2fe5e6f7bad4ce/zarrs_metadata/src/v2_to_v3.rs#L289-L293 for VSUMm + # Further, our pipeline does not support variable-length objects due to limitations on decode_into, so object is also out + if any( + info.dtype.kind in {"V", "S", "U", "M", "m", "O"} + for (_, info, _, _) in batch_info + ): + raise UnsupportedDataTypeError() diff --git a/python/zarrs/utils.py b/python/zarrs/utils.py index 0aa0d0d..e4058e4 100644 --- a/python/zarrs/utils.py +++ b/python/zarrs/utils.py @@ -3,10 +3,12 @@ import operator import os from functools import reduce -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any import numpy as np +from zarr.core.array_spec import ArraySpec from zarr.core.indexing import SelectorTuple, is_integer +from zarr.core.metadata.v2 import _default_fill_value from zarrs._internal import Basic, WithSubset @@ -15,7 +17,6 @@ from types import EllipsisType from zarr.abc.store import ByteGetter, ByteSetter - from zarr.core.array_spec import ArraySpec # adapted from https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ThreadPoolExecutor @@ -31,6 +32,10 @@ class CollapsedDimensionError(Exception): pass +class FillValueNoneError(Exception): + pass + + # This is a (mostly) copy of the function from zarr.core.indexing that fixes: # DeprecationWarning: Conversion of an array with ndim > 0 to a scalar is deprecated # TODO: Upstream this fix @@ -134,6 +139,12 @@ def get_shape_for_selector( return resulting_shape_from_index(shape, selector_tuple, drop_axes, pad=pad) +def get_implicit_fill_value(dtype: np.dtype, fill_value: Any) -> Any: + if fill_value is None: + fill_value = _default_fill_value(dtype) + return fill_value + + def make_chunk_info_for_rust_with_indices( batch_info: Iterable[ tuple[ByteGetter | ByteSetter, ArraySpec, SelectorTuple, SelectorTuple] @@ -144,6 +155,14 @@ def make_chunk_info_for_rust_with_indices( shape = shape if shape else (1,) # constant array chunk_info_with_indices: list[WithSubset] = [] for byte_getter, chunk_spec, chunk_selection, out_selection in batch_info: + if chunk_spec.fill_value is None: + chunk_spec = ArraySpec( + chunk_spec.shape, + chunk_spec.dtype, + get_implicit_fill_value(chunk_spec.dtype, chunk_spec.fill_value), + chunk_spec.config, + chunk_spec.prototype, + ) chunk_info = Basic(byte_getter, chunk_spec) out_selection_as_slices = selector_tuple_to_slice_selection(out_selection) chunk_selection_as_slices = selector_tuple_to_slice_selection(chunk_selection) @@ -169,14 +188,3 @@ def make_chunk_info_for_rust_with_indices( ) ) return chunk_info_with_indices - - -def make_chunk_info_for_rust( - batch_info: Iterable[ - tuple[ByteGetter | ByteSetter, ArraySpec, SelectorTuple, SelectorTuple] - ], -) -> list[Basic]: - return [ - Basic(byte_interface, chunk_spec) - for (byte_interface, chunk_spec, _, _) in batch_info - ] diff --git a/src/chunk_item.rs b/src/chunk_item.rs index 785bed0..7bdea4b 100644 --- a/src/chunk_item.rs +++ b/src/chunk_item.rs @@ -3,7 +3,7 @@ use std::num::NonZeroU64; use pyo3::{ exceptions::{PyRuntimeError, PyValueError}, pyclass, pymethods, - types::{PyAnyMethods as _, PySlice, PySliceMethods as _}, + types::{PyAnyMethods, PyBytes, PyBytesMethods, PyInt, PySlice, PySliceMethods as _}, Bound, PyAny, PyErr, PyResult, }; use pyo3_stub_gen::derive::{gen_stub_pyclass, gen_stub_pymethods}; @@ -31,6 +31,32 @@ pub(crate) struct Basic { representation: ChunkRepresentation, } +fn fill_value_to_bytes(dtype: &str, fill_value: &Bound<'_, PyAny>) -> PyResult> { + if dtype == "string" { + // Match zarr-python 2.x.x string fill value behaviour with a 0 fill value + // See https://github.com/zarr-developers/zarr-python/issues/2792#issuecomment-2644362122 + if let Ok(fill_value_downcast) = fill_value.downcast::() { + let fill_value_usize: usize = fill_value_downcast.extract()?; + if fill_value_usize == 0 { + return Ok(vec![]); + } + Err(PyErr::new::(format!( + "Cannot understand non-zero integer {fill_value_usize} fill value for dtype {dtype}" + )))?; + } + } + + if let Ok(fill_value_downcast) = fill_value.downcast::() { + Ok(fill_value_downcast.as_bytes().to_vec()) + } else if fill_value.hasattr("tobytes")? { + Ok(fill_value.call_method0("tobytes")?.extract()?) + } else { + Err(PyErr::new::(format!( + "Unsupported fill value {fill_value:?}" + ))) + } +} + #[gen_stub_pymethods] #[pymethods] impl Basic { @@ -40,18 +66,21 @@ impl Basic { let path: String = byte_interface.getattr("path")?.extract()?; let chunk_shape = chunk_spec.getattr("shape")?.extract()?; - let dtype: String = chunk_spec + let mut dtype: String = chunk_spec .getattr("dtype")? .call_method0("__str__")? .extract()?; - let fill_value = chunk_spec - .getattr("fill_value")? - .call_method0("tobytes")? - .extract()?; + if dtype == "object" { + // zarrs doesn't understand `object` which is the output of `np.dtype("|O").__str__()` + // but maps it to "string" internally https://github.com/LDeakin/zarrs/blob/0532fe983b7b42b59dbf84e50a2fe5e6f7bad4ce/zarrs_metadata/src/v2_to_v3.rs#L288 + dtype = String::from("string"); + } + let fill_value: Bound<'_, PyAny> = chunk_spec.getattr("fill_value")?; + let fill_value_bytes = fill_value_to_bytes(&dtype, &fill_value)?; Ok(Self { store, key: StoreKey::new(path).map_py_err::()?, - representation: get_chunk_representation(chunk_shape, &dtype, fill_value)?, + representation: get_chunk_representation(chunk_shape, &dtype, fill_value_bytes)?, }) } } diff --git a/src/lib.rs b/src/lib.rs index e9c54f8..cae087c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,7 +6,7 @@ use std::ptr::NonNull; use std::sync::Arc; use numpy::npyffi::PyArrayObject; -use numpy::{IntoPyArray, PyArray1, PyArrayDescrMethods, PyUntypedArray, PyUntypedArrayMethods}; +use numpy::{PyArrayDescrMethods, PyUntypedArray, PyUntypedArrayMethods}; use pyo3::exceptions::{PyRuntimeError, PyTypeError, PyValueError}; use pyo3::prelude::*; use pyo3_stub_gen::define_stub_info_gatherer; @@ -23,6 +23,7 @@ use zarrs::metadata::v3::MetadataV3; mod chunk_item; mod concurrency; +mod metadata_v2; mod runtime; mod store; #[cfg(test)] @@ -31,6 +32,7 @@ mod utils; use crate::chunk_item::ChunksItem; use crate::concurrency::ChunkConcurrentLimitAndCodecOptions; +use crate::metadata_v2::codec_metadata_v2_to_v3; use crate::store::StoreManager; use crate::utils::{PyErrExt as _, PyUntypedArrayExt as _}; @@ -264,6 +266,9 @@ impl CodecPipelineImpl { }; py.allow_threads(move || { + // FIXME: the `decode_into` methods only support fixed length data types. + // For variable length data types, need a codepath with non `_into` methods. + // Collect all the subsets and copy into value on the Python side? let update_chunk_subset = |item: chunk_item::WithSubset| { // See zarrs::array::Array::retrieve_chunk_subset_into if item.chunk_subset.start().iter().all(|&o| o == 0) @@ -337,54 +342,6 @@ impl CodecPipelineImpl { }) } - fn retrieve_chunks<'py>( - &self, - py: Python<'py>, - chunk_descriptions: Vec, // FIXME: Ref / iterable? - ) -> PyResult>>> { - // Adjust the concurrency based on the codec chain and the first chunk description - let Some((chunk_concurrent_limit, codec_options)) = - chunk_descriptions.get_chunk_concurrent_limit_and_codec_options(self)? - else { - return Ok(vec![]); - }; - - let chunk_bytes = py.allow_threads(move || { - let get_chunk_subset = |item: chunk_item::Basic| { - Ok(if let Some(chunk_encoded) = self.stores.get(&item)? { - let chunk_encoded: Vec = chunk_encoded.into(); - self.codec_chain - .decode( - Cow::Owned(chunk_encoded), - item.representation(), - &codec_options, - ) - .map_py_err::()? - } else { - // The chunk is missing so we need to create one. - let num_elements = item.representation().num_elements(); - let data_type_size = item.representation().data_type().size(); - let chunk_shape = ArraySize::new(data_type_size, num_elements); - ArrayBytes::new_fill_value(chunk_shape, item.representation().fill_value()) - } - .into_fixed() - .map_py_err::()? - .into_owned()) - }; - iter_concurrent_limit!( - chunk_concurrent_limit, - chunk_descriptions, - map, - get_chunk_subset - ) - .collect::>>>() - })?; - Ok(chunk_bytes - .into_iter() - .map(|x| x.into_pyarray(py)) - .collect()) - } - fn store_chunks_with_indices( &self, py: Python, @@ -399,6 +356,7 @@ impl CodecPipelineImpl { // Get input array let input_slice = Self::nparray_to_slice(value)?; let input = if value.ndim() > 0 { + // FIXME: Handle variable length data types, convert value to bytes and offsets InputValue::Array(ArrayBytes::new_flen(Cow::Borrowed(input_slice))) } else { InputValue::Constant(FillValue::new(input_slice.to_vec())) @@ -468,6 +426,7 @@ fn _internal(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_function(wrap_pyfunction!(codec_metadata_v2_to_v3, m)?)?; Ok(()) } diff --git a/src/metadata_v2.rs b/src/metadata_v2.rs new file mode 100644 index 0000000..de14123 --- /dev/null +++ b/src/metadata_v2.rs @@ -0,0 +1,54 @@ +use pyo3::{exceptions::PyRuntimeError, pyfunction, PyErr, PyResult}; +use zarrs::metadata::{ + v2::{array::ArrayMetadataV2Order, MetadataV2}, + v3::array::data_type::DataTypeMetadataV3, +}; + +#[pyfunction] +#[pyo3(signature = (filters=None, compressor=None))] +pub fn codec_metadata_v2_to_v3( + filters: Option>, + compressor: Option, +) -> PyResult> { + // Try and convert filters/compressor to V2 metadata + let filters = if let Some(filters) = filters { + Some( + filters + .into_iter() + .map(|filter| { + serde_json::from_str::(&filter) + .map_err(|err| PyErr::new::(err.to_string())) + }) + .collect::, _>>()?, + ) + } else { + None + }; + let compressor = if let Some(compressor) = compressor { + Some( + serde_json::from_str::(&compressor) + .map_err(|err| PyErr::new::(err.to_string()))?, + ) + } else { + None + }; + + // FIXME: The array order, dimensionality, data type, and endianness are needed to exhaustively support all Zarr V2 data that zarrs can handle. + // However, CodecPipeline.from_codecs does not supply this information, and CodecPipeline.evolve_from_array_spec is seemingly never called. + let metadata = zarrs::metadata::v2_to_v3::codec_metadata_v2_to_v3( + ArrayMetadataV2Order::C, + 0, // unused with C order + &DataTypeMetadataV3::Bool, // FIXME + None, + &filters, + &compressor, + ) + .map_err(|err| { + // TODO: More informative error messages from zarrs for ArrayMetadataV2ToV3ConversionError + PyErr::new::(err.to_string()) + })?; + Ok(metadata + .into_iter() + .map(|metadata| serde_json::to_string(&metadata).expect("infallible")) // TODO: Add method to zarrs + .collect()) +} diff --git a/tests/conftest.py b/tests/conftest.py index 7fc6670..a9f62d2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -33,6 +33,7 @@ class ArrayRequest: @pytest.fixture(autouse=True) def _setup_codec_pipeline(): config.set({"codec_pipeline.path": "zarrs.ZarrsCodecPipeline"}) + pass async def parse_store( @@ -67,69 +68,50 @@ def array_fixture(request: pytest.FixtureRequest) -> npt.NDArray[Any]: # tests that also fail with zarr-python's default codec pipeline zarr_python_default_codec_pipeline_failures = [ - # ellipsis weirdness, need to report - "test_roundtrip[oindex-2d-contiguous_in_chunk_array-ellipsis]", - "test_roundtrip[oindex-2d-discontinuous_in_chunk_array-ellipsis]", - "test_roundtrip[vindex-2d-contiguous_in_chunk_array-ellipsis]", - "test_roundtrip[vindex-2d-discontinuous_in_chunk_array-ellipsis]", - "test_roundtrip[oindex-2d-across_chunks_indices_array-ellipsis]", - "test_roundtrip[vindex-2d-ellipsis-across_chunks_indices_array]", - "test_roundtrip[vindex-2d-across_chunks_indices_array-ellipsis]", - "test_roundtrip[vindex-2d-ellipsis-contiguous_in_chunk_array]", - "test_roundtrip[vindex-2d-ellipsis-discontinuous_in_chunk_array]", - "test_roundtrip_read_only_zarrs[oindex-2d-contiguous_in_chunk_array-ellipsis]", - "test_roundtrip_read_only_zarrs[oindex-2d-discontinuous_in_chunk_array-ellipsis]", - "test_roundtrip_read_only_zarrs[vindex-2d-contiguous_in_chunk_array-ellipsis]", - "test_roundtrip_read_only_zarrs[vindex-2d-discontinuous_in_chunk_array-ellipsis]", - "test_roundtrip_read_only_zarrs[oindex-2d-across_chunks_indices_array-ellipsis]", - "test_roundtrip_read_only_zarrs[vindex-2d-ellipsis-across_chunks_indices_array]", - "test_roundtrip_read_only_zarrs[vindex-2d-across_chunks_indices_array-ellipsis]", - "test_roundtrip_read_only_zarrs[vindex-2d-ellipsis-contiguous_in_chunk_array]", - "test_roundtrip_read_only_zarrs[vindex-2d-ellipsis-discontinuous_in_chunk_array]", + # ellipsis weirdness, need to report, v3 + "test_roundtrip[oindex-2d-contiguous_in_chunk_array-ellipsis-v3]", + "test_roundtrip[oindex-2d-discontinuous_in_chunk_array-ellipsis-v3]", + "test_roundtrip[vindex-2d-contiguous_in_chunk_array-ellipsis-v3]", + "test_roundtrip[vindex-2d-discontinuous_in_chunk_array-ellipsis-v3]", + "test_roundtrip[oindex-2d-across_chunks_indices_array-ellipsis-v3]", + "test_roundtrip[vindex-2d-ellipsis-across_chunks_indices_array-v3]", + "test_roundtrip[vindex-2d-across_chunks_indices_array-ellipsis-v3]", + "test_roundtrip[vindex-2d-ellipsis-contiguous_in_chunk_array-v3]", + "test_roundtrip[vindex-2d-ellipsis-discontinuous_in_chunk_array-v3]", + "test_roundtrip_read_only_zarrs[oindex-2d-contiguous_in_chunk_array-ellipsis-v3]", + "test_roundtrip_read_only_zarrs[oindex-2d-discontinuous_in_chunk_array-ellipsis-v3]", + "test_roundtrip_read_only_zarrs[vindex-2d-contiguous_in_chunk_array-ellipsis-v3]", + "test_roundtrip_read_only_zarrs[vindex-2d-discontinuous_in_chunk_array-ellipsis-v3]", + "test_roundtrip_read_only_zarrs[oindex-2d-across_chunks_indices_array-ellipsis-v3]", + "test_roundtrip_read_only_zarrs[vindex-2d-ellipsis-across_chunks_indices_array-v3]", + "test_roundtrip_read_only_zarrs[vindex-2d-across_chunks_indices_array-ellipsis-v3]", + "test_roundtrip_read_only_zarrs[vindex-2d-ellipsis-contiguous_in_chunk_array-v3]", + "test_roundtrip_read_only_zarrs[vindex-2d-ellipsis-discontinuous_in_chunk_array-v3]", + # v2 + "test_roundtrip[oindex-2d-contiguous_in_chunk_array-ellipsis-v2]", + "test_roundtrip[oindex-2d-discontinuous_in_chunk_array-ellipsis-v2]", + "test_roundtrip[vindex-2d-contiguous_in_chunk_array-ellipsis-v2]", + "test_roundtrip[vindex-2d-discontinuous_in_chunk_array-ellipsis-v2]", + "test_roundtrip[oindex-2d-across_chunks_indices_array-ellipsis-v2]", + "test_roundtrip[vindex-2d-ellipsis-across_chunks_indices_array-v2]", + "test_roundtrip[vindex-2d-across_chunks_indices_array-ellipsis-v2]", + "test_roundtrip[vindex-2d-ellipsis-contiguous_in_chunk_array-v2]", + "test_roundtrip[vindex-2d-ellipsis-discontinuous_in_chunk_array-v2]", + "test_roundtrip_read_only_zarrs[oindex-2d-contiguous_in_chunk_array-ellipsis-v2]", + "test_roundtrip_read_only_zarrs[oindex-2d-discontinuous_in_chunk_array-ellipsis-v2]", + "test_roundtrip_read_only_zarrs[vindex-2d-contiguous_in_chunk_array-ellipsis-v2]", + "test_roundtrip_read_only_zarrs[vindex-2d-discontinuous_in_chunk_array-ellipsis-v2]", + "test_roundtrip_read_only_zarrs[oindex-2d-across_chunks_indices_array-ellipsis-v2]", + "test_roundtrip_read_only_zarrs[vindex-2d-ellipsis-across_chunks_indices_array-v2]", + "test_roundtrip_read_only_zarrs[vindex-2d-across_chunks_indices_array-ellipsis-v2]", + "test_roundtrip_read_only_zarrs[vindex-2d-ellipsis-contiguous_in_chunk_array-v2]", + "test_roundtrip_read_only_zarrs[vindex-2d-ellipsis-discontinuous_in_chunk_array-v2]", # need to investigate this one - it seems to fail with the default pipeline # but it makes some sense that it succeeds with ours since we fall-back to numpy indexing # in the case of a collapsed dimension # "test_roundtrip_read_only_zarrs[vindex-2d-contiguous_in_chunk_array-contiguous_in_chunk_array]", ] -zarrs_python_no_discontinuous_writes = [ - "test_roundtrip[oindex-2d-discontinuous_in_chunk_array-slice_in_chunk]", - "test_roundtrip[oindex-2d-discontinuous_in_chunk_array-slice_across_chunks]", - "test_roundtrip[oindex-2d-discontinuous_in_chunk_array-full_slice]", - "test_roundtrip[oindex-2d-discontinuous_in_chunk_array-int]", - "test_roundtrip[oindex-2d-slice_in_chunk-discontinuous_in_chunk_array]", - "test_roundtrip[oindex-2d-slice_across_chunks-discontinuous_in_chunk_array]", - "test_roundtrip[oindex-2d-full_slice-discontinuous_in_chunk_array]", - "test_roundtrip[oindex-2d-int-discontinuous_in_chunk_array]", - "test_roundtrip[oindex-2d-ellipsis-discontinuous_in_chunk_array]", - "test_roundtrip[vindex-2d-discontinuous_in_chunk_array-slice_in_chunk]", - "test_roundtrip[vindex-2d-discontinuous_in_chunk_array-slice_across_chunks]", - "test_roundtrip[vindex-2d-discontinuous_in_chunk_array-full_slice]", - "test_roundtrip[vindex-2d-discontinuous_in_chunk_array-int]", - "test_roundtrip[vindex-2d-slice_in_chunk-discontinuous_in_chunk_array]", - "test_roundtrip[vindex-2d-slice_across_chunks-discontinuous_in_chunk_array]", - "test_roundtrip[vindex-2d-full_slice-discontinuous_in_chunk_array]", - "test_roundtrip[vindex-2d-int-discontinuous_in_chunk_array]", - "test_roundtrip[oindex-2d-discontinuous_in_chunk_array-contiguous_in_chunk_array]", - "test_roundtrip[oindex-2d-contiguous_in_chunk_array-discontinuous_in_chunk_array]", - "test_roundtrip[oindex-2d-across_chunks_indices_array-discontinuous_in_chunk_array]", - "test_roundtrip[oindex-2d-discontinuous_in_chunk_array-discontinuous_in_chunk_array]", - "test_roundtrip[vindex-2d-contiguous_in_chunk_array-discontinuous_in_chunk_array]", - "test_roundtrip[vindex-2d-discontinuous_in_chunk_array-discontinuous_in_chunk_array]", - "test_roundtrip[oindex-2d-discontinuous_in_chunk_array-across_chunks_indices_array]", - "test_roundtrip[vindex-2d-discontinuous_in_chunk_array-contiguous_in_chunk_array]", - "test_roundtrip[oindex-1d-discontinuous_in_chunk_array]", - "test_roundtrip[vindex-1d-discontinuous_in_chunk_array]", -] - -# vindexing with two contiguous arrays would be converted to two slices but -# in numpy indexing actually requires dropping a dimension, which in turn boils -# down to integer indexing, which we can't do i.e., [np.array(1, 2), np.array(1, 2)] -> [slice(1, 3), slice(1, 3)] -# is not a correct conversion, and thus we don't support the write operation -zarrs_python_no_collapsed_dim = [ - "test_roundtrip[vindex-2d-contiguous_in_chunk_array-contiguous_in_chunk_array]" -] - def pytest_collection_modifyitems( config: pytest.Config, items: Iterable[pytest.Item] @@ -140,15 +122,3 @@ def pytest_collection_modifyitems( reason="This test fails with the zarr-python default codec pipeline." ) item.add_marker(xfail_marker) - if item.name in zarrs_python_no_discontinuous_writes: - xfail_marker = pytest.mark.xfail( - raises=DiscontiguousArrayError, - reason="zarrs discontinuous writes are not supported.", - ) - item.add_marker(xfail_marker) - if item.name in zarrs_python_no_collapsed_dim: - xfail_marker = pytest.mark.xfail( - raises=CollapsedDimensionError, - reason="zarrs vindexing with multiple contiguous arrays is not supported.", - ) - item.add_marker(xfail_marker) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 1af6f0f..70d5ffc 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -49,6 +49,8 @@ def fill_value() -> int: pytest.param(lambda x: x, id="vindex"), ] +zarr_formats = [2, 3] + def pytest_generate_tests(metafunc): old_pipeline_path = zarr.config.get("codec_pipeline.path") @@ -60,32 +62,38 @@ def pytest_generate_tests(metafunc): store_values = [] indexing_methods = [] ids = [] - for dimensionality in dimensionalities_: - indexers = non_numpy_indices if dimensionality > 2 else all_indices - for index_param_prod in product(indexers, repeat=dimensionality): - index = tuple(index_param.values[0] for index_param in index_param_prod) - # multi-ellipsis indexing is not supported - if sum(isinstance(i, EllipsisType) for i in index) > 1: - continue - for indexing_method_param in indexing_method_params: - arr = gen_arr(fill_value_, Path(tempfile.mktemp()), dimensionality) - indexing_method = indexing_method_param.values[0] - dimensionality_id = f"{dimensionality}d" - id = "-".join( - [indexing_method_param.id, dimensionality_id] - + [index_param.id for index_param in index_param_prod] + for format in zarr_formats: + for dimensionality in dimensionalities_: + indexers = non_numpy_indices if dimensionality > 2 else all_indices + for index_param_prod in product(indexers, repeat=dimensionality): + index = tuple( + index_param.values[0] for index_param in index_param_prod ) - ids.append(id) - store_values.append( - gen_store_values( - indexing_method, - index, - full_array((axis_size_,) * dimensionality), + # multi-ellipsis indexing is not supported + if sum(isinstance(i, EllipsisType) for i in index) > 1: + continue + for indexing_method_param in indexing_method_params: + arr = gen_arr( + fill_value_, Path(tempfile.mktemp()), dimensionality, format ) - ) - indexing_methods.append(indexing_method) - indices.append(index) - arrs.append(arr) + indexing_method = indexing_method_param.values[0] + dimensionality_id = f"{dimensionality}d" + id = "-".join( + [indexing_method_param.id, dimensionality_id] + + [index_param.id for index_param in index_param_prod] + + [f"v{format}"] + ) + ids.append(id) + store_values.append( + gen_store_values( + indexing_method, + index, + full_array((axis_size_,) * dimensionality), + ) + ) + indexing_methods.append(indexing_method) + indices.append(index) + arrs.append(arr) # array is used as param name to prevent collision with arr fixture metafunc.parametrize( ["array", "index", "store_values", "indexing_method"], @@ -139,14 +147,17 @@ def maybe_convert( return full_array[index] -def gen_arr(fill_value, tmp_path, dimensionality) -> zarr.Array: +def gen_arr(fill_value, tmp_path, dimensionality, format) -> zarr.Array: return zarr.create( (axis_size_,) * dimensionality, store=LocalStore(root=tmp_path / ".zarr"), chunks=(chunk_size_,) * dimensionality, dtype=np.int16, fill_value=fill_value, - codecs=[zarr.codecs.BytesCodec(), zarr.codecs.BloscCodec()], + codecs=[zarr.codecs.BytesCodec(), zarr.codecs.BloscCodec()] + if format == 3 + else None, + zarr_format=format, ) @@ -155,9 +166,14 @@ def dimensionality(request): return request.param +@pytest.fixture(params=zarr_formats) +def format(request): + return request.param + + @pytest.fixture -def arr(dimensionality, tmp_path) -> zarr.Array: - return gen_arr(fill_value_, tmp_path, dimensionality) +def arr(dimensionality, tmp_path, format) -> zarr.Array: + return gen_arr(fill_value_, tmp_path, dimensionality, format) def test_fill_value(arr: zarr.Array): @@ -196,6 +212,28 @@ def test_roundtrip( ), res +def test_ellipsis_indexing_invalid(arr: zarr.Array): + if len(arr.shape) <= 2: + pytest.skip( + "Ellipsis indexing works for 1D and 2D arrays in zarr-python despite a shape mismatch" + ) + stored_value = np.array([1, 2, 3]) + with pytest.raises(ValueError): # noqa: PT011 + # zarrs-python error: ValueError: operands could not be broadcast together with shapes (4,) (3,) + # numpy error: ValueError: could not broadcast input array from shape (3,) into shape (4,) + arr[2, ...] = stored_value + + +def test_pickle(arr: zarr.Array, tmp_path: Path): + arr[:] = np.arange(reduce(operator.mul, arr.shape, 1)).reshape(arr.shape) + expected = arr[:] + with Path.open(tmp_path / "arr.pickle", "wb") as f: + pickle.dump(arr._async_array.codec_pipeline, f) + with Path.open(tmp_path / "arr.pickle", "rb") as f: + object.__setattr__(arr._async_array, "codec_pipeline", pickle.load(f)) + assert (arr[:] == expected).all() + + @contextmanager def use_zarr_default_codec_reader(): zarr.config.set( @@ -218,25 +256,3 @@ def test_roundtrip_read_only_zarrs( assert np.all( res == store_values, ), res - - -def test_ellipsis_indexing_invalid(arr: zarr.Array): - if len(arr.shape) <= 2: - pytest.skip( - "Ellipsis indexing works for 1D and 2D arrays in zarr-python despite a shape mismatch" - ) - stored_value = np.array([1, 2, 3]) - with pytest.raises(ValueError): # noqa: PT011 - # zarrs-python error: ValueError: operands could not be broadcast together with shapes (4,) (3,) - # numpy error: ValueError: could not broadcast input array from shape (3,) into shape (4,) - arr[2, ...] = stored_value - - -def test_pickle(arr: zarr.Array, tmp_path: Path): - arr[:] = np.arange(reduce(operator.mul, arr.shape, 1)).reshape(arr.shape) - expected = arr[:] - with Path.open(tmp_path / "arr.pickle", "wb") as f: - pickle.dump(arr._async_array.codec_pipeline, f) - with Path.open(tmp_path / "arr.pickle", "rb") as f: - object.__setattr__(arr._async_array, "codec_pipeline", pickle.load(f)) - assert (arr[:] == expected).all() diff --git a/tests/test_v2.py b/tests/test_v2.py new file mode 100644 index 0000000..fd8592e --- /dev/null +++ b/tests/test_v2.py @@ -0,0 +1,346 @@ +import json +from collections.abc import Iterator +from pathlib import Path +from typing import Any, Literal + +import numcodecs.vlen +import numpy as np +import pytest +import zarr +import zarr.core.buffer +import zarr.storage +from numcodecs import Delta +from numcodecs.blosc import Blosc +from numcodecs.zstd import Zstd +from zarr import config +from zarr.abc.store import Store +from zarr.core.buffer.core import default_buffer_prototype +from zarr.core.sync import sync +from zarr.storage import LocalStore, StorePath + + +@pytest.fixture +async def store(tmp_path) -> Iterator[StorePath]: + return StorePath(await LocalStore.open(tmp_path)) + + +def test_simple(store: StorePath) -> None: + data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) + + a = zarr.create_array( + store / "simple_v2", + zarr_format=2, + shape=data.shape, + chunks=(16, 16), + dtype=data.dtype, + fill_value=0, + ) + + a[:, :] = data + assert np.array_equal(data, a[:, :]) + + +@pytest.mark.parametrize( + ("dtype", "fill_value"), + [ + ("bool", False), + ("int64", 0), + ("float64", 0.0), + ("|S1", b""), + ("|U1", ""), + ("object", ""), + (str, ""), + ], +) +def test_implicit_fill_value(store: LocalStore, dtype: str, fill_value: Any) -> None: + arr = zarr.create( + store=store, shape=(4,), fill_value=None, zarr_format=2, dtype=dtype + ) + assert arr.metadata.fill_value is None + assert arr.metadata.to_dict()["fill_value"] is None + result = arr[:] + numpy_dtype = np.dtype(object) if dtype is str else np.dtype(dtype) + expected = np.full(arr.shape, fill_value, dtype=numpy_dtype) + np.testing.assert_array_equal(result, expected) + + +def test_codec_pipeline(tmp_path) -> None: + # https://github.com/zarr-developers/zarr-python/issues/2243 + store = LocalStore(tmp_path) + array = zarr.create( + store=store, + shape=(1,), + dtype="i4", + zarr_format=2, + filters=[Delta(dtype="i4").get_config()], + compressor=Blosc().get_config(), + ) + array[:] = 1 + result = array[:] + expected = np.ones(1) + np.testing.assert_array_equal(result, expected) + + +@pytest.mark.parametrize( + ("dtype", "expected_dtype", "fill_value", "fill_value_encoding"), + [ + ("|S", "|S0", b"X", "WA=="), + ("|V", "|V0", b"X", "WA=="), + ("|V10", "|V10", b"X", "WAAAAAAAAAAAAA=="), + ], +) +async def test_v2_encode_decode( + dtype, expected_dtype, fill_value, fill_value_encoding, tmp_path +) -> None: + with config.set( + { + "array.v2_default_filters.bytes": [{"id": "vlen-bytes"}], + "array.v2_default_compressor.bytes": None, + } + ): + store = zarr.storage.LocalStore(tmp_path) + g = zarr.group(store=store, zarr_format=2) + g.create_array( + name="foo", + shape=(3,), + chunks=(3,), + dtype=dtype, + fill_value=fill_value, + compressor=None, + ) + + result = await store.get( + "foo/.zarray", zarr.core.buffer.default_buffer_prototype() + ) + assert result is not None + + serialized = json.loads(result.to_bytes()) + expected = { + "chunks": [3], + "compressor": None, + "dtype": expected_dtype, + "fill_value": fill_value_encoding, + "filters": [{"id": "vlen-bytes"}] if dtype == "|S" else None, + "order": "C", + "shape": [3], + "zarr_format": 2, + "dimension_separator": ".", + } + assert serialized == expected + + data = zarr.open_array(store=store, path="foo")[:] + expected = np.full((3,), b"X", dtype=dtype) + np.testing.assert_equal(data, expected) + + +@pytest.mark.parametrize("dtype_value", [["|S", b"Y"], ["|U", "Y"], ["O", b"Y"]]) +def test_v2_encode_decode_with_data(dtype_value, tmp_path): + dtype, value = dtype_value + with config.set( + { + "array.v2_default_filters": { + "string": [{"id": "vlen-utf8"}], + "bytes": [{"id": "vlen-bytes"}], + }, + } + ): + expected = np.full((3,), value, dtype=dtype) + a = zarr.create( + store=tmp_path, + shape=(3,), + zarr_format=2, + dtype=dtype, + ) + a[:] = expected + data = a[:] + np.testing.assert_equal(data, expected) + + +@pytest.mark.parametrize("dtype", [str, "str"]) +async def test_create_dtype_str(dtype: Any, tmp_path) -> None: + # see https://github.com/zarr-developers/zarr-python/issues/2627 for why this test + # is probably wrong + arr = zarr.create(store=tmp_path, shape=3, dtype=dtype, zarr_format=2) + assert arr.dtype.kind == "O" + assert arr.metadata.to_dict()["dtype"] == "|O" + assert arr.metadata.filters == (numcodecs.vlen.VLenBytes(),) + arr[:] = [b"a", b"bb", b"ccc"] + result = arr[:] + np.testing.assert_array_equal( + result, np.array([b"a", b"bb", b"ccc"], dtype="object") + ) + + +@pytest.mark.parametrize( + "filters", [[], [numcodecs.Delta(dtype=" None: + array_fixture = [42] + with config.set({"array.order": order}): + arr = zarr.create( + store=tmp_path, shape=1, dtype=" None: + store = LocalStore(tmp_path / "a_store") + arr = zarr.create_array( + store, + shape=(10, 8), + chunks=(3, 3), + fill_value=np.nan, + dtype="float64", + zarr_format=2, + filters=None, + compressors=None, + overwrite=True, + order=array_order, + config={"order": memory_order}, + ) + + # Non-contiguous write + a = np.arange(arr.shape[0] * arr.shape[1]).reshape(arr.shape, order=data_order) + arr[6:9, 3:6] = a[6:9, 3:6] # The slice on the RHS is important + np.testing.assert_array_equal(arr[6:9, 3:6], a[6:9, 3:6]) + + np.testing.assert_array_equal( + a[6:9, 3:6], + np.frombuffer( + sync(store.get("2.1", default_buffer_prototype())).to_bytes(), + dtype="float64", + ).reshape((3, 3), order=array_order), + ) + if memory_order == "F": + assert (arr[6:9, 3:6]).flags.f_contiguous + else: + assert (arr[6:9, 3:6]).flags.c_contiguous + + store = LocalStore(tmp_path / "other_store") + arr = zarr.create_array( + store, + shape=(10, 8), + chunks=(3, 3), + fill_value=np.nan, + dtype="float64", + zarr_format=2, + compressors=None, + filters=None, + overwrite=True, + order=array_order, + config={"order": memory_order}, + ) + + # Contiguous write + a = np.arange(9).reshape((3, 3), order=data_order) + if data_order == "F": + assert a.flags.f_contiguous + else: + assert a.flags.c_contiguous + arr[6:9, 3:6] = a + np.testing.assert_array_equal(arr[6:9, 3:6], a) + + +def test_default_compressor_deprecation_warning(): + with pytest.warns(DeprecationWarning, match="default_compressor is deprecated"): + zarr.storage.default_compressor = "zarr.codecs.zstd.ZstdCodec()" + + +@pytest.mark.parametrize( + "dtype_expected", + [ + ["b", "zstd", None], + ["i", "zstd", None], + ["f", "zstd", None], + ["|S1", "zstd", "vlen-bytes"], + ["|U1", "zstd", "vlen-utf8"], + ], +) +def test_default_filters_and_compressor(dtype_expected: Any) -> None: + with config.set( + { + "array.v2_default_compressor": { + "numeric": {"id": "zstd", "level": "0"}, + "string": {"id": "zstd", "level": "0"}, + "bytes": {"id": "zstd", "level": "0"}, + }, + "array.v2_default_filters": { + "numeric": [], + "string": [{"id": "vlen-utf8"}], + "bytes": [{"id": "vlen-bytes"}], + }, + } + ): + dtype, expected_compressor, expected_filter = dtype_expected + arr = zarr.create(shape=(3,), path="foo", store={}, zarr_format=2, dtype=dtype) + assert arr.metadata.compressor.codec_id == expected_compressor + if expected_filter is not None: + assert arr.metadata.filters[0].codec_id == expected_filter + + +@pytest.mark.parametrize("fill_value", [None, (b"", 0, 0.0)], ids=["no_fill", "fill"]) +def test_structured_dtype_roundtrip(fill_value, tmp_path) -> None: + a = np.array( + [(b"aaa", 1, 4.2), (b"bbb", 2, 8.4), (b"ccc", 3, 12.6)], + dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")], + ) + array_path = tmp_path / "data.zarr" + za = zarr.create( + shape=(3,), + store=array_path, + chunks=(2,), + fill_value=fill_value, + zarr_format=2, + dtype=a.dtype, + ) + if fill_value is not None: + assert (np.array([fill_value] * a.shape[0], dtype=a.dtype) == za[:]).all() + za[...] = a + za = zarr.open_array(store=array_path) + assert (a == za[:]).all()