Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Array creation with strings for filters, serializer, compressors #2839

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
1 change: 1 addition & 0 deletions changes/2839.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Array creation allows string representation of codecs for ``filters``, ``serializer``, and ``compressors``.
2 changes: 1 addition & 1 deletion src/zarr/api/synchronous.py
Original file line number Diff line number Diff line change
Expand Up @@ -792,7 +792,7 @@ def create_array(
chunk to bytes.

For Zarr format 3, a "filter" is a codec that takes an array and returns an array,
and these values must be instances of ``ArrayArrayCodec``, or dict representations
and these values must be instances of ``ArrayArrayCodec``, or dict or string representations
of ``ArrayArrayCodec``.
If no ``filters`` are provided, a default set of filters will be used.
These defaults can be changed by modifying the value of ``array.v3_default_filters``
Expand Down
27 changes: 10 additions & 17 deletions src/zarr/core/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -3769,23 +3769,23 @@ def _get_default_codecs(


FiltersLike: TypeAlias = (
Iterable[dict[str, JSON] | ArrayArrayCodec | numcodecs.abc.Codec]
Iterable[dict[str, JSON] | str | ArrayArrayCodec | numcodecs.abc.Codec]
| ArrayArrayCodec
| Iterable[numcodecs.abc.Codec]
| numcodecs.abc.Codec
| Literal["auto"]
| str
| None
)
CompressorLike: TypeAlias = dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec | None
CompressorsLike: TypeAlias = (
Iterable[dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec]
Iterable[dict[str, JSON] | str | BytesBytesCodec | numcodecs.abc.Codec]
| dict[str, JSON]
| BytesBytesCodec
| numcodecs.abc.Codec
| Literal["auto"]
| str
| None
)
SerializerLike: TypeAlias = dict[str, JSON] | ArrayBytesCodec | Literal["auto"]
SerializerLike: TypeAlias = dict[str, JSON] | ArrayBytesCodec | str


class ShardsConfigParam(TypedDict):
Expand Down Expand Up @@ -4053,7 +4053,7 @@ async def create_array(
chunk to bytes.

For Zarr format 3, a "filter" is a codec that takes an array and returns an array,
and these values must be instances of ``ArrayArrayCodec``, or dict representations
and these values must be instances of ``ArrayArrayCodec``, or dict or string representations
of ``ArrayArrayCodec``.
If no ``filters`` are provided, a default set of filters will be used.
These defaults can be changed by modifying the value of ``array.v3_default_filters``
Expand Down Expand Up @@ -4264,24 +4264,13 @@ def _parse_chunk_encoding_v2(
elif isinstance(compressor, tuple | list) and len(compressor) == 1:
_compressor = parse_compressor(compressor[0])
else:
if isinstance(compressor, Iterable) and not isinstance(compressor, dict):
Copy link
Member Author

@brokkoli71 brokkoli71 Feb 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same

msg = f"For Zarr format 2 arrays, the `compressor` must be a single codec. Got an iterable with type {type(compressor)} instead."
raise TypeError(msg)
_compressor = parse_compressor(compressor)

if filters is None:
_filters = None
elif filters == "auto":
_filters = default_filters
else:
if isinstance(filters, Iterable):
Copy link
Member Author

@brokkoli71 brokkoli71 Feb 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is also checked in parse_filters, so I removed it

for idx, f in enumerate(filters):
if not isinstance(f, numcodecs.abc.Codec):
msg = (
"For Zarr format 2 arrays, all elements of `filters` must be numcodecs codecs. "
f"Element at index {idx} has type {type(f)}, which is not a numcodecs codec."
)
raise TypeError(msg)
_filters = parse_filters(filters)

return _filters, _compressor
Expand All @@ -4305,6 +4294,8 @@ def _parse_chunk_encoding_v3(
out_array_array: tuple[ArrayArrayCodec, ...] = ()
elif filters == "auto":
out_array_array = default_array_array
elif isinstance(filters, str):
out_array_array = (_parse_array_array_codec(filters),)
else:
maybe_array_array: Iterable[Codec | dict[str, JSON]]
if isinstance(filters, dict | Codec):
Expand All @@ -4322,6 +4313,8 @@ def _parse_chunk_encoding_v3(
out_bytes_bytes: tuple[BytesBytesCodec, ...] = ()
elif compressors == "auto":
out_bytes_bytes = default_bytes_bytes
elif isinstance(compressors, str):
out_bytes_bytes = (_parse_bytes_bytes_codec(compressors),)
else:
maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]]
if isinstance(compressors, dict | Codec):
Expand Down
6 changes: 3 additions & 3 deletions src/zarr/core/group.py
Original file line number Diff line number Diff line change
Expand Up @@ -1045,7 +1045,7 @@ async def create_array(
chunk to bytes.

For Zarr format 3, a "filter" is a codec that takes an array and returns an array,
and these values must be instances of ``ArrayArrayCodec``, or dict representations
and these values must be instances of ``ArrayArrayCodec``, or dict or string representations
of ``ArrayArrayCodec``.
If no ``filters`` are provided, a default set of filters will be used.
These defaults can be changed by modifying the value of ``array.v3_default_filters``
Expand Down Expand Up @@ -2280,7 +2280,7 @@ def create_array(
chunk to bytes.

For Zarr format 3, a "filter" is a codec that takes an array and returns an array,
and these values must be instances of ``ArrayArrayCodec``, or dict representations
and these values must be instances of ``ArrayArrayCodec``, or dict or string representations
of ``ArrayArrayCodec``.
If no ``filters`` are provided, a default set of filters will be used.
These defaults can be changed by modifying the value of ``array.v3_default_filters``
Expand Down Expand Up @@ -2678,7 +2678,7 @@ def array(
chunk to bytes.

For Zarr format 3, a "filter" is a codec that takes an array and returns an array,
and these values must be instances of ``ArrayArrayCodec``, or dict representations
and these values must be instances of ``ArrayArrayCodec``, or dict or string representations
of ``ArrayArrayCodec``.
If no ``filters`` are provided, a default set of filters will be used.
These defaults can be changed by modifying the value of ``array.v3_default_filters``
Expand Down
12 changes: 9 additions & 3 deletions src/zarr/core/metadata/v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,20 +246,24 @@

if data is None:
return data
if isinstance(data, str):
return (numcodecs.get_codec({"id": data}),)
if isinstance(data, Iterable):
for idx, val in enumerate(data):
if isinstance(val, numcodecs.abc.Codec):
out.append(val)
elif isinstance(val, dict):
out.append(numcodecs.get_codec(val))
elif isinstance(val, str):
out.append(numcodecs.get_codec({"id": val}))
else:
msg = f"Invalid filter at index {idx}. Expected a numcodecs.abc.Codec or a dict representation of numcodecs.abc.Codec. Got {type(val)} instead."
msg = f"For Zarr format 2 arrays, all elements of `filters` must be a numcodecs.abc.Codec or a dict or str representation of numcodecs.abc.Codec. Got {type(val)} at index {idx} instead."
raise TypeError(msg)
return tuple(out)
# take a single codec instance and wrap it in a tuple
if isinstance(data, numcodecs.abc.Codec):
return (data,)
msg = f"Invalid filters. Expected None, an iterable of numcodecs.abc.Codec or dict representations of numcodecs.abc.Codec. Got {type(data)} instead."
msg = f"For Zarr format 2 arrays, all elements of `filters` must be None, an iterable of numcodecs.abc.Codec or dict representations of numcodecs.abc.Codec. Got {type(data)} instead."
raise TypeError(msg)


Expand All @@ -271,7 +275,9 @@
return data
if isinstance(data, dict):
return numcodecs.get_codec(data)
msg = f"Invalid compressor. Expected None, a numcodecs.abc.Codec, or a dict representation of a numcodecs.abc.Codec. Got {type(data)} instead."
if isinstance(data, str):
return numcodecs.get_codec({"id": data})
msg = f"For Zarr format 2 arrays, the `compressor` must be a single codec. Expected None, a numcodecs.abc.Codec, or a dict or str representation of a numcodecs.abc.Codec. Got {type(data)} instead."

Check warning on line 280 in src/zarr/core/metadata/v2.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/core/metadata/v2.py#L278-L280

Added lines #L278 - L280 were not covered by tests
raise ValueError(msg)


Expand Down
12 changes: 9 additions & 3 deletions src/zarr/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,14 +166,16 @@
return get_codec_class(data["name"]).from_dict(data) # type: ignore[arg-type]


def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec) -> BytesBytesCodec:
def _parse_bytes_bytes_codec(data: dict[str, JSON] | str | Codec) -> BytesBytesCodec:
"""
Normalize the input to a ``BytesBytesCodec`` instance.
If the input is already a ``BytesBytesCodec``, it is returned as is. If the input is a str, it is
treated as a codec name with an empty configuration. If the input is a dict, it is converted to a
``BytesBytesCodec`` instance via the ``_resolve_codec`` function.
"""
from zarr.abc.codec import BytesBytesCodec

if isinstance(data, str):
data = {"name": data, "configuration": {}}

Check warning on line 178 in src/zarr/registry.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/registry.py#L178

Added line #L178 was not covered by tests
if isinstance(data, dict):
result = _resolve_codec(data)
if not isinstance(result, BytesBytesCodec):
Expand All @@ -186,14 +188,16 @@
return result


def _parse_array_bytes_codec(data: dict[str, JSON] | Codec) -> ArrayBytesCodec:
def _parse_array_bytes_codec(data: dict[str, JSON] | str | Codec) -> ArrayBytesCodec:
"""
Normalize the input to an ``ArrayBytesCodec`` instance.
If the input is already an ``ArrayBytesCodec``, it is returned as is. If the input is a str, it is
treated as a codec name with an empty configuration. If the input is a dict, it is converted to an
``ArrayBytesCodec`` instance via the ``_resolve_codec`` function.
"""
from zarr.abc.codec import ArrayBytesCodec

if isinstance(data, str):
data = {"name": data, "configuration": {}}

Check warning on line 200 in src/zarr/registry.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/registry.py#L200

Added line #L200 was not covered by tests
if isinstance(data, dict):
result = _resolve_codec(data)
if not isinstance(result, ArrayBytesCodec):
Expand All @@ -206,14 +210,16 @@
return result


def _parse_array_array_codec(data: dict[str, JSON] | Codec) -> ArrayArrayCodec:
def _parse_array_array_codec(data: dict[str, JSON] | str | Codec) -> ArrayArrayCodec:
"""
Normalize the input to an ``ArrayArrayCodec`` instance.
If the input is already an ``ArrayArrayCodec``, it is returned as is. If the input is a str, it is
treated as a codec name with an empty configuration. If the input is a dict, it is converted to an
``ArrayArrayCodec`` instance via the ``_resolve_codec`` function.
"""
from zarr.abc.codec import ArrayArrayCodec

if isinstance(data, str):
data = {"name": data, "configuration": {}}

Check warning on line 222 in src/zarr/registry.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/registry.py#L222

Added line #L222 was not covered by tests
if isinstance(data, dict):
result = _resolve_codec(data)
if not isinstance(result, ArrayArrayCodec):
Expand Down
57 changes: 53 additions & 4 deletions tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from zarr.core.array import (
CompressorsLike,
FiltersLike,
SerializerLike,
_get_default_chunk_encoding_v2,
_get_default_chunk_encoding_v3,
_parse_chunk_encoding_v2,
Expand Down Expand Up @@ -1025,6 +1026,15 @@ async def test_no_filters_compressors(store: MemoryStore, dtype: str, empty_valu
ZstdCodec(level=3),
{"name": "zstd", "configuration": {"level": 3}},
({"name": "zstd", "configuration": {"level": 3}},),
"zstd",
("crc32c", "zstd"),
],
)
@pytest.mark.parametrize(
"serializer",
[
"auto",
"bytes",
],
)
@pytest.mark.parametrize(
Expand Down Expand Up @@ -1065,6 +1075,7 @@ async def test_no_filters_compressors(store: MemoryStore, dtype: str, empty_valu
async def test_v3_chunk_encoding(
store: MemoryStore,
compressors: CompressorsLike,
serializer: SerializerLike,
filters: FiltersLike,
dtype: str,
chunks: tuple[int, ...],
Expand All @@ -1073,6 +1084,9 @@ async def test_v3_chunk_encoding(
"""
Test various possibilities for the compressors, serializer, and filters parameters to create_array
"""
if serializer == "bytes" and dtype == "str":
serializer = "vlen-utf8"

arr = await create_array(
store=store,
dtype=dtype,
Expand All @@ -1081,12 +1095,14 @@ async def test_v3_chunk_encoding(
shards=shards,
zarr_format=3,
filters=filters,
serializer=serializer,
compressors=compressors,
)
filters_expected, _, compressors_expected = _parse_chunk_encoding_v3(
filters=filters, compressors=compressors, serializer="auto", dtype=np.dtype(dtype)
filters_expected, serializer_expected, compressors_expected = _parse_chunk_encoding_v3(
filters=filters, compressors=compressors, serializer=serializer, dtype=np.dtype(dtype)
)
assert arr.filters == filters_expected
assert arr.serializer == serializer_expected
assert arr.compressors == compressors_expected

@staticmethod
Expand All @@ -1098,11 +1114,20 @@ async def test_v3_chunk_encoding(
None,
numcodecs.Zstd(level=3),
(),
(numcodecs.Zstd(level=3),),
(numcodecs.Zstd(level=2),),
"zstd",
],
)
@pytest.mark.parametrize(
"filters", ["auto", None, numcodecs.GZip(level=1), (numcodecs.GZip(level=1),)]
"filters",
[
"auto",
None,
numcodecs.GZip(level=1),
(numcodecs.GZip(level=2)),
"gzip",
("gzip", "zstd"),
],
)
async def test_v2_chunk_encoding(
store: MemoryStore, compressors: CompressorsLike, filters: FiltersLike, dtype: str
Expand All @@ -1129,6 +1154,30 @@ async def test_v2_chunk_encoding(
assert arr.compressors == compressor_expected
assert arr.filters == filters_expected

@staticmethod
async def test_invalid_chunk_encoding(store: MemoryStore) -> None:
    """
    Test that passing an invalid compressor or filter to create_array raises an error.

    Two cases are checked:

    * Zarr format 2 with a compressor of an unsupported type (an ``int``) must raise
      ``ValueError`` carrying the full explanatory message.
    * Zarr format 3 with a filter name that is not a registered codec must raise ``KeyError``.
    """
    # Local import keeps this test self-contained; ``re`` is only needed for escaping below.
    import re

    invalid_compressor_type = 2
    msg = f"For Zarr format 2 arrays, the `compressor` must be a single codec. Expected None, a numcodecs.abc.Codec, or a dict or str representation of a numcodecs.abc.Codec. Got {type(invalid_compressor_type)} instead."
    # ``pytest.raises(match=...)`` interprets its argument as a regular expression.
    # Escape the message so that characters such as "." are matched literally rather
    # than as wildcards, which would make the assertion looser than intended.
    with pytest.raises(ValueError, match=re.escape(msg)):
        await create_array(
            store=store,
            dtype="uint8",
            shape=(10,),
            zarr_format=2,
            compressors=invalid_compressor_type,
        )
    # A string that does not name a registered codec is expected to surface as a KeyError.
    with pytest.raises(KeyError):
        await create_array(
            store=store,
            dtype="uint8",
            shape=(10,),
            zarr_format=3,
            filters="nonexistent_filter_name",
        )

@staticmethod
@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"])
async def test_default_filters_compressors(
Expand Down