From a2320a07d92ba814985c59ec4173d324ce914a87 Mon Sep 17 00:00:00 2001 From: Nathan Zimmerman Date: Mon, 24 Feb 2025 12:38:58 -0600 Subject: [PATCH] Unify metadata v2 fill value parsing --- src/zarr/core/metadata/v2.py | 84 ++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 46 deletions(-) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index f5520c9055..a28181da04 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -109,6 +109,29 @@ def shards(self) -> ChunkCoords | None: return None def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: + def _serialize_fill_value(fv: Any) -> JSON: + if self.fill_value is None: + pass + elif self.dtype.kind in "SV": + # There's a relationship between self.dtype and self.fill_value + # that mypy isn't aware of. The fact that we have S or V dtype here + # means we should have a bytes-type fill_value. + fv = base64.standard_b64encode(cast(bytes, self.fill_value)).decode("ascii") + elif isinstance(fv, np.datetime64): + if np.isnat(fv): + fv = "NaT" + else: + fv = np.datetime_as_string(fv) + elif isinstance(fv, numbers.Real): + float_fv = float(fv) + if np.isnan(float_fv): + fv = "NaN" + elif np.isinf(float_fv): + fv = "Infinity" if float_fv > 0 else "-Infinity" + elif isinstance(fv, numbers.Complex): + fv = [_serialize_fill_value(fv.real), _serialize_fill_value(fv.imag)] + return cast(JSON, fv) + def _json_convert( o: Any, ) -> Any: @@ -147,6 +170,7 @@ def _json_convert( raise TypeError zarray_dict = self.to_dict() + zarray_dict["fill_value"] = _serialize_fill_value(zarray_dict["fill_value"]) zattrs_dict = zarray_dict.pop("attributes", {}) json_indent = config.get("json_indent") return { @@ -166,26 +190,7 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: _data = data.copy() # Check that the zarr_format attribute is correct. _ = parse_zarr_format(_data.pop("zarr_format")) - dtype = parse_dtype(_data["dtype"]) - if dtype.kind in "SV": - fill_value_encoded = _data.get("fill_value") - if fill_value_encoded is not None: - fill_value: Any = base64.standard_b64decode(fill_value_encoded) - _data["fill_value"] = fill_value - else: - fill_value = _data.get("fill_value") - if fill_value is not None: - if np.issubdtype(dtype, np.datetime64): - if fill_value == "NaT": - _data["fill_value"] = np.array("NaT", dtype=dtype)[()] - else: - _data["fill_value"] = np.array(fill_value, dtype=dtype)[()] - elif dtype.kind == "c" and isinstance(fill_value, list) and len(fill_value) == 2: - val = complex(float(fill_value[0]), float(fill_value[1])) - _data["fill_value"] = np.array(val, dtype=dtype)[()] - elif dtype.kind in "f" and fill_value in {"NaN", "Infinity", "-Infinity"}: - _data["fill_value"] = np.array(fill_value, dtype=dtype)[()] # zarr v2 allowed arbitrary keys in the metadata. # Filter the keys to only those expected by the constructor. expected = {x.name for x in fields(cls)} @@ -206,33 +211,8 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: return cls(**_data) def to_dict(self) -> dict[str, JSON]: - def _sanitize_fill_value(fv: Any) -> JSON: - if fv is None: - return fv - elif isinstance(fv, np.datetime64): - if np.isnat(fv): - return "NaT" - return np.datetime_as_string(fv) - elif isinstance(fv, numbers.Real): - float_fv = float(fv) - if np.isnan(float_fv): - fv = "NaN" - elif np.isinf(float_fv): - fv = "Infinity" if float_fv > 0 else "-Infinity" - elif isinstance(fv, numbers.Complex): - fv = [_sanitize_fill_value(fv.real), _sanitize_fill_value(fv.imag)] - return cast(JSON, fv) - zarray_dict = super().to_dict() - if self.dtype.kind in "SV" and self.fill_value is not None: - # There's a relationship between self.dtype and self.fill_value - # that mypy isn't aware of. The fact that we have S or V dtype here - # means we should have a bytes-type fill_value. - fill_value = base64.standard_b64encode(cast(bytes, self.fill_value)).decode("ascii") - zarray_dict["fill_value"] = fill_value - - zarray_dict["fill_value"] = _sanitize_fill_value(zarray_dict["fill_value"]) _ = zarray_dict.pop("dtype") dtype_json: JSON # In the case of zarr v2, the simplest i.e., '|VXX' dtype is represented as a string @@ -330,7 +310,7 @@ def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata: return data -def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any: +def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any: """ Parse a potential fill value into a value that is compatible with the provided dtype. @@ -345,6 +325,7 @@ def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any: ------- An instance of `dtype`, or `None`, or any python object (in the case of an object dtype) """ + if fill_value is None or dtype.hasobject: # no fill value pass @@ -352,7 +333,6 @@ def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any: # this should be compatible across numpy versions for any array type, including # structured arrays fill_value = np.zeros((), dtype=dtype)[()] - elif dtype.kind == "U": # special case unicode because of encoding issues on Windows if passed through numpy # https://github.com/alimanfoo/zarr/pull/172#issuecomment-343782713 @@ -361,6 +341,18 @@ def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any: raise ValueError( f"fill_value {fill_value!r} is not valid for dtype {dtype}; must be a unicode string" ) + elif dtype.kind in "SV" and isinstance(fill_value, str): + fill_value = base64.standard_b64decode(fill_value) + elif np.issubdtype(dtype, np.datetime64): + if fill_value == "NaT": + fill_value = np.array("NaT", dtype=dtype)[()] + else: + fill_value = np.array(fill_value, dtype=dtype)[()] + elif dtype.kind == "c" and isinstance(fill_value, list) and len(fill_value) == 2: + complex_val = complex(float(fill_value[0]), float(fill_value[1])) + fill_value = np.array(complex_val, dtype=dtype)[()] + elif dtype.kind in "f" and fill_value in {"NaN", "Infinity", "-Infinity"}: + fill_value = np.array(fill_value, dtype=dtype)[()] else: try: if isinstance(fill_value, bytes) and dtype.kind == "V":