fix: V2 codec pipeline creation

ilan-gold · Feb 3, 2025 · 59e60fc · 59e60fc
1 parent 45efee1
commit 59e60fc
Show file tree

Hide file tree

Showing 3 changed files with 35 additions and 13 deletions.
diff --git a/python/zarrs/pipeline.py b/python/zarrs/pipeline.py
@@ -51,14 +51,31 @@ def get_codec_pipeline_impl(codec_metadata_json: str) -> CodecPipelineImpl:
 
 
 def codecs_to_dict(codecs: Iterable[Codec]) -> Generator[dict[str, Any], None, None]:
+    # See https://github.com/LDeakin/zarrs/blob/9070e12ea06c297532347af3668be9927ba35fa1/zarrs_metadata/src/v2_to_v3.rs#L69
     for codec in codecs:
         if codec.__class__.__name__ == "V2Codec":
             codec_dict = codec.to_dict()
-            compressor = {}
+            has_array_to_bytes = False
+            if codec_dict.get("filters", None) is not None:
+                for filter in codec_dict.get("filters"):
+                    filter = filter.get_config()
+                    name = filter.pop("id")
+                    if name in [
+                        "vlen-array",
+                        "vlen-bytes",
+                        "vlen-utf8",
+                        "zfpy",
+                        "pcodec",
+                    ]:
+                        has_array_to_bytes = True
+                    as_dict = {"name": name, "configuration": filter}
+                    yield as_dict
+            if not has_array_to_bytes:
+                yield BytesCodec().to_dict()
             if codec_dict.get("compressor", None) is not None:
                 compressor = codec_dict["compressor"].get_config()
                 if compressor.get("id", None) == "zstd":
-                    yield {
+                    as_dict = {
                         "name": "zstd",
                         "configuration": {
                             "level": int(compressor["level"]),
@@ -85,16 +102,10 @@ def codecs_to_dict(codecs: Iterable[Codec]) -> Generator[dict[str, Any], None, N
                                 as_dict["shuffle"] = "shuffle"
                             case 2:
                                 as_dict["shuffle"] = "bitshuffle"
-                    yield as_dict
                 else:
-                    yield compressor
-            elif codec_dict.get("filter", None) is not None:
-                filter_ = codec_dict["filter"].get_config()
-                yield filter_
-            # TODO: get the endianness added to V2Codec API
-            # TODO: how to handle this with strings, which don't need this but zarrs
-            # complains about its absence if its not there
-            yield BytesCodec().to_dict()
+                    name = compressor.pop("id")
+                    as_dict = {"name": name, "configuration": compressor}
+                yield as_dict
         else:
             yield codec.to_dict()
 

diff --git a/src/lib.rs b/src/lib.rs
@@ -264,6 +264,9 @@ impl CodecPipelineImpl {
         };
 
         py.allow_threads(move || {
+            // FIXME: the `decode_into` methods only support fixed-length data types.
+            // Variable-length data types need a code path that uses the non-`_into` methods.
+            // Perhaps collect all the subsets and copy them into `value` on the Python side?
             let update_chunk_subset = |item: chunk_item::WithSubset| {
                 // See zarrs::array::Array::retrieve_chunk_subset_into
                 if item.chunk_subset.start().iter().all(|&o| o == 0)
@@ -351,6 +354,7 @@ impl CodecPipelineImpl {
         // Get input array
         let input_slice = Self::nparray_to_slice(value)?;
         let input = if value.ndim() > 0 {
+            // FIXME: handle variable-length data types by converting `value` to bytes and offsets.
             InputValue::Array(ArrayBytes::new_flen(Cow::Borrowed(input_slice)))
         } else {
             InputValue::Constant(FillValue::new(input_slice.to_vec()))

diff --git a/tests/test_v2.py b/tests/test_v2.py
@@ -158,10 +158,17 @@ def test_v2_encode_decode_with_data(dtype_value, tmp_path):
 
 @pytest.mark.parametrize("dtype", [str, "str"])
 async def test_create_dtype_str(dtype: Any, tmp_path: Path) -> None:
-    arr = zarr.create(store=tmp_path, shape=3, dtype=dtype, zarr_format=2)
+    arr = zarr.create(
+        store=tmp_path,
+        shape=3,
+        dtype=dtype,
+        zarr_format=2,
+        # https://github.com/zarr-developers/zarr-python/issues/2627
+        filters=zarr.codecs.vlen_utf8.VLenUTF8(),
+    )
     assert arr.dtype.kind == "O"
     assert arr.metadata.to_dict()["dtype"] == "|O"
-    assert arr.metadata.filters == (numcodecs.vlen.VLenBytes(),)
+    assert arr.metadata.filters == (zarr.codecs.vlen_utf8.VLenUTF8(),)
     arr[:] = [b"a", b"bb", b"ccc"]
     result = arr[:]
     np.testing.assert_array_equal(