From da9188ba03710a70e4ba193c5227476d84dfd547 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Mon, 7 Oct 2024 08:43:05 +1100 Subject: [PATCH 1/6] Change Zarr V3 chunk key encoding to `v2` with `.` separator This matches the encoding in `manifest.json`. --- virtualizarr/writers/zarr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virtualizarr/writers/zarr.py b/virtualizarr/writers/zarr.py index b3dc8f1a..4aa6e85a 100644 --- a/virtualizarr/writers/zarr.py +++ b/virtualizarr/writers/zarr.py @@ -92,8 +92,8 @@ def zarr_v3_array_metadata(zarray: ZArray, dim_names: list[str], attrs: dict) -> "configuration": {"chunk_shape": metadata.pop("chunks")}, } metadata["chunk_key_encoding"] = { - "name": "default", - "configuration": {"separator": "/"}, + "name": "v2", + "configuration": {"separator": "."}, } metadata["codecs"] = zarray._v3_codec_pipeline() metadata.pop("filters") From 3de4d5d133b198effbdca1c9344788a3f48114f2 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Mon, 7 Oct 2024 09:01:04 +1100 Subject: [PATCH 2/6] Fix Zarr V3 non-finite float fill value encoding --- virtualizarr/zarr.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index f62b1269..56660d1a 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -66,6 +66,14 @@ def __post_init__(self) -> None: if self.fill_value is None: self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype.kind, 0.0) + # Handle non-finite fill values + if self.fill_value is np.nan: + self.fill_value = "NaN" + elif self.fill_value is np.inf: + self.fill_value = "Infinity" + elif self.fill_value is -np.inf: + self.fill_value = "-Infinity" + @property def codec(self) -> Codec: """For comparison against other arrays.""" From 425b5b446f420167fa6ceaafbe5cdd0704c33c0c Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Mon, 7 Oct 2024 09:12:01 +1100 Subject: [PATCH 3/6] Fix Zarr V3 NaN fill value for integer arrays --- virtualizarr/zarr.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index 56660d1a..75346750 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -67,12 +67,14 @@ def __post_init__(self) -> None: self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype.kind, 0.0) # Handle non-finite fill values - if self.fill_value is np.nan: - self.fill_value = "NaN" - elif self.fill_value is np.inf: - self.fill_value = "Infinity" - elif self.fill_value is -np.inf: - self.fill_value = "-Infinity" + if not isinstance(self.fill_value, list): + if self.fill_value is np.nan: + self.fill_value = "NaN" + elif self.fill_value is np.inf: + self.fill_value = "Infinity" + elif self.fill_value is -np.inf: # TODO: does this work? + self.fill_value = "-Infinity" + # TODO: Handle other data types (complex, etc.) @property def codec(self) -> Codec: @@ -81,10 +83,22 @@ def codec(self) -> Codec: @classmethod def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray": + dtype = np.dtype(decoded_arr_refs_zarray["dtype"]) + # coerce type of fill_value as kerchunk can be inconsistent with this fill_value = decoded_arr_refs_zarray["fill_value"] if fill_value is None or fill_value == "NaN" or fill_value == "nan": - fill_value = np.nan + if dtype.kind == "f": + fill_value = np.nan + elif dtype.kind == "c": + fill_value = [np.nan, np.nan] + elif dtype.kind == "i": + fill_value = 0 + else: + # TODO: Handle other data types + raise ValueError( + f"Fill value {fill_value} is not valid for dtype {dtype}" + ) compressor = decoded_arr_refs_zarray["compressor"] zarr_format = int(decoded_arr_refs_zarray["zarr_format"]) From 479f7bc2c32f884d764eb5ce5e3db38d3a1f114c Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Mon, 7 Oct 2024 09:12:33 +1100 Subject: [PATCH 4/6] Zarr V3 default to little endian configuration for `bytes` codec --- virtualizarr/zarr.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index 75346750..af996602 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -207,8 +207,10 @@ def _v3_codec_pipeline(self) -> list: # https://github.com/zarr-developers/zarr-python/pull/1944#issuecomment-2151994097 # "If no ArrayBytesCodec is supplied, we can auto-add a BytesCodec" bytes = dict( - name="bytes", configuration={} - ) # TODO need to handle endianess configuration + name="bytes", configuration={ + "endian": "little" # TODO need to handle endianess configuration, but little is a sensible default for now + } + ) # The order here is significant! # [ArrayArray] -> ArrayBytes -> [BytesBytes] From 2a5dc68c3d2366d50c0f4619f9b9bc9930f93b49 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Mon, 7 Oct 2024 09:52:07 +1100 Subject: [PATCH 5/6] Fix tests using `null` fill value or `nan` fill value for integer data type --- virtualizarr/tests/test_kerchunk.py | 2 +- virtualizarr/tests/test_manifests/test_array.py | 6 +++--- virtualizarr/tests/test_readers/test_kerchunk.py | 2 +- virtualizarr/tests/test_writers/test_kerchunk.py | 8 ++++---- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/virtualizarr/tests/test_kerchunk.py b/virtualizarr/tests/test_kerchunk.py index 2442ec8d..668ae118 100644 --- a/virtualizarr/tests/test_kerchunk.py +++ b/virtualizarr/tests/test_kerchunk.py @@ -23,7 +23,7 @@ def test_kerchunk_roundtrip_in_memory_no_concat(): chunks=(2, 2), compressor=None, filters=None, - fill_value=np.nan, + fill_value=0, order="C", ), chunkmanifest=manifest, diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index 6d5ede79..f1c85ea7 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -37,7 +37,7 @@ def test_create_manifestarray(self): def test_create_manifestarray_from_kerchunk_refs(self): arr_refs = { - ".zarray": '{"chunks":[2,3],"compressor":null,"dtype":" Date: Sun, 6 Oct 2024 23:07:09 +0000 Subject: [PATCH 6/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- virtualizarr/tests/test_manifests/test_array.py | 2 +- virtualizarr/tests/test_readers/test_kerchunk.py | 2 +- virtualizarr/zarr.py | 7 ++++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index f1c85ea7..89ba019c 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -46,7 +46,7 @@ def test_create_manifestarray_from_kerchunk_refs(self): assert marr.chunks == (2, 3) assert marr.dtype == np.dtype("int64") assert marr.zarray.compressor is None - assert marr.zarray.fill_value is 0 + assert marr.zarray.fill_value == 0 assert marr.zarray.filters is None assert marr.zarray.order == "C" diff --git a/virtualizarr/tests/test_readers/test_kerchunk.py b/virtualizarr/tests/test_readers/test_kerchunk.py index 3b6819d0..455e1dbf 100644 --- a/virtualizarr/tests/test_readers/test_kerchunk.py +++ b/virtualizarr/tests/test_readers/test_kerchunk.py @@ -37,7 +37,7 @@ def test_dataset_from_df_refs(): assert da.data.zarray.compressor is None assert da.data.zarray.filters is None - assert da.data.zarray.fill_value is 0 + assert da.data.zarray.fill_value == 0 assert da.data.zarray.order == "C" assert da.data.manifest.dict() == { diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index af996602..f98c0e09 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -72,7 +72,7 @@ def __post_init__(self) -> None: self.fill_value = "NaN" elif self.fill_value is np.inf: self.fill_value = "Infinity" - elif self.fill_value is -np.inf: # TODO: does this work? + elif self.fill_value is -np.inf: # TODO: does this work? self.fill_value = "-Infinity" # TODO: Handle other data types (complex, etc.) @@ -207,9 +207,10 @@ def _v3_codec_pipeline(self) -> list: # https://github.com/zarr-developers/zarr-python/pull/1944#issuecomment-2151994097 # "If no ArrayBytesCodec is supplied, we can auto-add a BytesCodec" bytes = dict( - name="bytes", configuration={ + name="bytes", + configuration={ "endian": "little" # TODO need to handle endianess configuration, but little is a sensible default for now - } + }, ) # The order here is significant!