Skip to content

Commit

Permalink
fix: V2 codec pipeline creation
Browse files Browse the repository at this point in the history
  • Loading branch information
LDeakin committed Feb 3, 2025
1 parent 45efee1 commit 59e60fc
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 13 deletions.
33 changes: 22 additions & 11 deletions python/zarrs/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,14 +51,31 @@ def get_codec_pipeline_impl(codec_metadata_json: str) -> CodecPipelineImpl:


def codecs_to_dict(codecs: Iterable[Codec]) -> Generator[dict[str, Any], None, None]:
# See https://github.com/LDeakin/zarrs/blob/9070e12ea06c297532347af3668be9927ba35fa1/zarrs_metadata/src/v2_to_v3.rs#L69
for codec in codecs:
if codec.__class__.__name__ == "V2Codec":
codec_dict = codec.to_dict()
compressor = {}
has_array_to_bytes = False
if codec_dict.get("filters", None) is not None:
for filter in codec_dict.get("filters"):
filter = filter.get_config()
name = filter.pop("id")
if name in [
"vlen-array",
"vlen-bytes",
"vlen-utf8",
"zfpy",
"pcodec",
]:
has_array_to_bytes = True
as_dict = {"name": name, "configuration": filter}
yield as_dict
if not has_array_to_bytes:
yield BytesCodec().to_dict()
if codec_dict.get("compressor", None) is not None:
compressor = codec_dict["compressor"].get_config()
if compressor.get("id", None) == "zstd":
yield {
as_dict = {
"name": "zstd",
"configuration": {
"level": int(compressor["level"]),
Expand All @@ -85,16 +102,10 @@ def codecs_to_dict(codecs: Iterable[Codec]) -> Generator[dict[str, Any], None, N
as_dict["shuffle"] = "shuffle"
case 2:
as_dict["shuffle"] = "bitshuffle"
yield as_dict
else:
yield compressor
elif codec_dict.get("filter", None) is not None:
filter_ = codec_dict["filter"].get_config()
yield filter_
# TODO: get the endianness added to V2Codec API
# TODO: how to handle this with strings, which don't need this but zarrs
# complains about its absence if it's not there
yield BytesCodec().to_dict()
name = compressor.pop("id")
as_dict = {"name": name, "configuration": compressor}
yield as_dict
else:
yield codec.to_dict()

Expand Down
4 changes: 4 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,9 @@ impl CodecPipelineImpl {
};

py.allow_threads(move || {
// FIXME: the `decode_into` methods only support fixed-length data types.
// Variable-length data types need a codepath that uses the non-`_into` methods.
// Collect all the subsets and copy into value on the Python side?
let update_chunk_subset = |item: chunk_item::WithSubset| {
// See zarrs::array::Array::retrieve_chunk_subset_into
if item.chunk_subset.start().iter().all(|&o| o == 0)
Expand Down Expand Up @@ -351,6 +354,7 @@ impl CodecPipelineImpl {
// Get input array
let input_slice = Self::nparray_to_slice(value)?;
let input = if value.ndim() > 0 {
// FIXME: Handle variable-length data types: convert value to bytes and offsets
InputValue::Array(ArrayBytes::new_flen(Cow::Borrowed(input_slice)))
} else {
InputValue::Constant(FillValue::new(input_slice.to_vec()))
Expand Down
11 changes: 9 additions & 2 deletions tests/test_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,10 +158,17 @@ def test_v2_encode_decode_with_data(dtype_value, tmp_path):

@pytest.mark.parametrize("dtype", [str, "str"])
async def test_create_dtype_str(dtype: Any, tmp_path: Path) -> None:
arr = zarr.create(store=tmp_path, shape=3, dtype=dtype, zarr_format=2)
arr = zarr.create(
store=tmp_path,
shape=3,
dtype=dtype,
zarr_format=2,
# https://github.com/zarr-developers/zarr-python/issues/2627
filters=zarr.codecs.vlen_utf8.VLenUTF8(),
)
assert arr.dtype.kind == "O"
assert arr.metadata.to_dict()["dtype"] == "|O"
assert arr.metadata.filters == (numcodecs.vlen.VLenBytes(),)
assert arr.metadata.filters == (zarr.codecs.vlen_utf8.VLenUTF8(),)
arr[:] = [b"a", b"bb", b"ccc"]
result = arr[:]
np.testing.assert_array_equal(
Expand Down

0 comments on commit 59e60fc

Please sign in to comment.