Skip to content

Commit

Permalink
Unify metadata v2 fill value parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
moradology committed Feb 24, 2025
1 parent 260cfbc commit a2320a0
Showing 1 changed file with 38 additions and 46 deletions.
84 changes: 38 additions & 46 deletions src/zarr/core/metadata/v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,29 @@ def shards(self) -> ChunkCoords | None:
return None

def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]:
def _serialize_fill_value(fv: Any) -> JSON:
    """
    Encode a fill value as a JSON-compatible value.

    This helper is a closure: besides its argument it reads ``self.fill_value``
    and ``self.dtype`` from the enclosing method's scope.

    Parameters
    ----------
    fv : Any
        The fill value to encode.

    Returns
    -------
    JSON
        - ``fv`` unchanged when ``self.fill_value`` is ``None``.
        - A base64 (ASCII) string of ``self.fill_value`` when the dtype kind is
          ``S`` or ``V``.
        - ``"NaT"`` or an ISO datetime string for ``np.datetime64`` values.
        - ``"NaN"``, ``"Infinity"`` or ``"-Infinity"`` for non-finite real values;
          finite real values are returned unchanged.
        - A two-element list ``[real, imag]`` (each part encoded recursively) for
          complex values.
        - ``fv`` unchanged for anything else.
    """
    if self.fill_value is None:
        # Nothing configured on the metadata object: pass the argument through.
        return cast(JSON, fv)

    if self.dtype.kind in "SV":
        # There's a relationship between self.dtype and self.fill_value
        # that mypy isn't aware of. The fact that we have S or V dtype here
        # means we should have a bytes-type fill_value.
        # Note that the instance attribute, not ``fv``, is what gets encoded.
        raw_bytes = cast(bytes, self.fill_value)
        return cast(JSON, base64.standard_b64encode(raw_bytes).decode("ascii"))

    if isinstance(fv, np.datetime64):
        encoded_datetime = "NaT" if np.isnat(fv) else np.datetime_as_string(fv)
        return cast(JSON, encoded_datetime)

    if isinstance(fv, numbers.Real):
        as_float = float(fv)
        if np.isnan(as_float):
            return cast(JSON, "NaN")
        if np.isinf(as_float):
            return cast(JSON, "Infinity" if as_float > 0 else "-Infinity")
        # Finite real numbers need no special encoding.
        return cast(JSON, fv)

    if isinstance(fv, numbers.Complex):
        # Encode real and imaginary parts independently (real first).
        return cast(JSON, [_serialize_fill_value(part) for part in (fv.real, fv.imag)])

    return cast(JSON, fv)

def _json_convert(
o: Any,
) -> Any:
Expand Down Expand Up @@ -147,6 +170,7 @@ def _json_convert(
raise TypeError

zarray_dict = self.to_dict()
zarray_dict["fill_value"] = _serialize_fill_value(zarray_dict["fill_value"])
zattrs_dict = zarray_dict.pop("attributes", {})
json_indent = config.get("json_indent")
return {
Expand All @@ -166,26 +190,7 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
_data = data.copy()
# Check that the zarr_format attribute is correct.
_ = parse_zarr_format(_data.pop("zarr_format"))
dtype = parse_dtype(_data["dtype"])

if dtype.kind in "SV":
fill_value_encoded = _data.get("fill_value")
if fill_value_encoded is not None:
fill_value: Any = base64.standard_b64decode(fill_value_encoded)
_data["fill_value"] = fill_value
else:
fill_value = _data.get("fill_value")
if fill_value is not None:
if np.issubdtype(dtype, np.datetime64):
if fill_value == "NaT":
_data["fill_value"] = np.array("NaT", dtype=dtype)[()]
else:
_data["fill_value"] = np.array(fill_value, dtype=dtype)[()]
elif dtype.kind == "c" and isinstance(fill_value, list) and len(fill_value) == 2:
val = complex(float(fill_value[0]), float(fill_value[1]))
_data["fill_value"] = np.array(val, dtype=dtype)[()]
elif dtype.kind in "f" and fill_value in {"NaN", "Infinity", "-Infinity"}:
_data["fill_value"] = np.array(fill_value, dtype=dtype)[()]
# zarr v2 allowed arbitrary keys in the metadata.
# Filter the keys to only those expected by the constructor.
expected = {x.name for x in fields(cls)}
Expand All @@ -206,33 +211,8 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata:
return cls(**_data)

def to_dict(self) -> dict[str, JSON]:
def _sanitize_fill_value(fv: Any) -> JSON:
    """
    Convert a fill value into a JSON-compatible value.

    Parameters
    ----------
    fv : Any
        The fill value to convert.

    Returns
    -------
    JSON
        - ``None`` when ``fv`` is ``None``.
        - ``"NaT"`` or an ISO datetime string for ``np.datetime64`` values.
        - ``"NaN"``, ``"Infinity"`` or ``"-Infinity"`` for non-finite real values;
          finite real values are returned unchanged.
        - A two-element list ``[real, imag]`` (each part converted recursively)
          for complex values.
        - ``fv`` unchanged for anything else.
    """
    if fv is None:
        return None

    if isinstance(fv, np.datetime64):
        return "NaT" if np.isnat(fv) else np.datetime_as_string(fv)

    sanitized = fv
    if isinstance(fv, numbers.Real):
        as_float = float(fv)
        if np.isnan(as_float):
            sanitized = "NaN"
        elif np.isinf(as_float):
            sanitized = "Infinity" if as_float > 0 else "-Infinity"
    elif isinstance(fv, numbers.Complex):
        # Convert real and imaginary parts independently (real first).
        sanitized = [_sanitize_fill_value(part) for part in (fv.real, fv.imag)]
    return cast(JSON, sanitized)

zarray_dict = super().to_dict()

if self.dtype.kind in "SV" and self.fill_value is not None:
# There's a relationship between self.dtype and self.fill_value
# that mypy isn't aware of. The fact that we have S or V dtype here
# means we should have a bytes-type fill_value.
fill_value = base64.standard_b64encode(cast(bytes, self.fill_value)).decode("ascii")
zarray_dict["fill_value"] = fill_value

zarray_dict["fill_value"] = _sanitize_fill_value(zarray_dict["fill_value"])
_ = zarray_dict.pop("dtype")
dtype_json: JSON
# In the case of zarr v2, the simplest i.e., '|VXX' dtype is represented as a string
Expand Down Expand Up @@ -330,7 +310,7 @@ def parse_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata:
return data


def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any:
def parse_fill_value(fill_value: Any, dtype: np.dtype[Any]) -> Any:
"""
Parse a potential fill value into a value that is compatible with the provided dtype.
Expand All @@ -345,14 +325,14 @@ def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any:
-------
An instance of `dtype`, or `None`, or any python object (in the case of an object dtype)
"""

if fill_value is None or dtype.hasobject:
# no fill value
pass
elif not isinstance(fill_value, np.void) and fill_value == 0:
# this should be compatible across numpy versions for any array type, including
# structured arrays
fill_value = np.zeros((), dtype=dtype)[()]

elif dtype.kind == "U":
# special case unicode because of encoding issues on Windows if passed through numpy
# https://github.com/alimanfoo/zarr/pull/172#issuecomment-343782713
Expand All @@ -361,6 +341,18 @@ def parse_fill_value(fill_value: object, dtype: np.dtype[Any]) -> Any:
raise ValueError(
f"fill_value {fill_value!r} is not valid for dtype {dtype}; must be a unicode string"
)
elif dtype.kind in "SV" and isinstance(fill_value, str):
fill_value = base64.standard_b64decode(fill_value)
elif np.issubdtype(dtype, np.datetime64):
if fill_value == "NaT":
fill_value = np.array("NaT", dtype=dtype)[()]
else:
fill_value = np.array(fill_value, dtype=dtype)[()]
elif dtype.kind == "c" and isinstance(fill_value, list) and len(fill_value) == 2:
complex_val = complex(float(fill_value[0]), float(fill_value[1]))
fill_value = np.array(complex_val, dtype=dtype)[()]
elif dtype.kind in "f" and fill_value in {"NaN", "Infinity", "-Infinity"}:
fill_value = np.array(fill_value, dtype=dtype)[()]
else:
try:
if isinstance(fill_value, bytes) and dtype.kind == "V":
Expand Down

0 comments on commit a2320a0

Please sign in to comment.