diff --git a/.github/workflows/releases.yml b/.github/workflows/releases.yml index 51fcf08591..b54cbe48b3 100644 --- a/.github/workflows/releases.yml +++ b/.github/workflows/releases.yml @@ -55,7 +55,7 @@ jobs: with: name: releases path: dist - - uses: pypa/gh-action-pypi-publish@v1.8.14 + - uses: pypa/gh-action-pypi-publish@v1.9.0 with: user: __token__ password: ${{ secrets.pypi_password }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index de1adb8840..1ef226cd28 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,7 +7,7 @@ default_language_version: python: python3 repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: 'v0.4.8' + rev: 'v0.4.9' hooks: - id: ruff args: ["--fix", "--show-fixes"] diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 1f452159ed..1d7106e25a 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -2,12 +2,15 @@ from abc import abstractmethod from collections.abc import Awaitable, Callable, Iterable -from typing import TYPE_CHECKING, Generic, TypeVar +from typing import TYPE_CHECKING, Any, Generic, TypeVar + +import numpy as np from zarr.abc.metadata import Metadata from zarr.abc.store import ByteGetter, ByteSetter from zarr.buffer import Buffer, NDBuffer -from zarr.common import concurrent_map +from zarr.chunk_grids import ChunkGrid +from zarr.common import ChunkCoords, concurrent_map from zarr.config import config if TYPE_CHECKING: @@ -15,7 +18,6 @@ from zarr.array_spec import ArraySpec from zarr.indexing import SelectorTuple - from zarr.metadata import ArrayMetadata CodecInput = TypeVar("CodecInput", bound=NDBuffer | Buffer) CodecOutput = TypeVar("CodecOutput", bound=NDBuffer | Buffer) @@ -75,13 +77,18 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: """ return self - def validate(self, array_metadata: ArrayMetadata) -> None: + def validate(self, *, shape: ChunkCoords, dtype: np.dtype[Any], chunk_grid: ChunkGrid) -> None: """Validates that the 
codec configuration is compatible with the array metadata. Raises errors when the codec configuration is not compatible. Parameters ---------- - array_metadata : ArrayMetadata + shape: ChunkCoords + The array shape + dtype: np.dtype[Any] + The array data type + chunk_grid: ChunkGrid + The array chunk grid """ ... @@ -275,13 +282,18 @@ def supports_partial_decode(self) -> bool: ... def supports_partial_encode(self) -> bool: ... @abstractmethod - def validate(self, array_metadata: ArrayMetadata) -> None: + def validate(self, *, shape: ChunkCoords, dtype: np.dtype[Any], chunk_grid: ChunkGrid) -> None: """Validates that all codec configurations are compatible with the array metadata. Raises errors when a codec configuration is not compatible. Parameters ---------- - array_metadata : ArrayMetadata + shape: ChunkCoords + The array shape + dtype: np.dtype[Any] + The array data type + chunk_grid: ChunkGrid + The array chunk grid """ ... diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 52d07fb6fe..6cf7378bfa 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -322,7 +322,7 @@ async def tree(*args: Any, **kwargs: Any) -> None: raise NotImplementedError -async def array(data: NDArrayLike, **kwargs: Any) -> AsyncArray: +async def array(data: npt.ArrayLike, **kwargs: Any) -> AsyncArray: """Create an array filled with `data`. Parameters diff --git a/src/zarr/array.py b/src/zarr/array.py index 9ac1ce41ec..4318050dd5 100644 --- a/src/zarr/array.py +++ b/src/zarr/array.py @@ -11,26 +11,27 @@ # 1. Was splitting the array into two classes really necessary? 
from asyncio import gather from collections.abc import Iterable -from dataclasses import dataclass, replace +from dataclasses import dataclass, field, replace from typing import Any, Literal, cast import numpy as np import numpy.typing as npt -from zarr.abc.codec import Codec +from zarr.abc.codec import Codec, CodecPipeline from zarr.abc.store import set_or_delete from zarr.attributes import Attributes from zarr.buffer import BufferPrototype, NDArrayLike, NDBuffer, default_buffer_prototype from zarr.chunk_grids import RegularChunkGrid from zarr.chunk_key_encodings import ChunkKeyEncoding, DefaultChunkKeyEncoding, V2ChunkKeyEncoding from zarr.codecs import BytesCodec +from zarr.codecs._v2 import V2Compressor, V2Filters +from zarr.codecs.pipeline import BatchedCodecPipeline from zarr.common import ( JSON, ZARR_JSON, ZARRAY_JSON, ZATTRS_JSON, ChunkCoords, - Selection, ZarrFormat, concurrent_map, product, @@ -41,7 +42,6 @@ BasicSelection, BlockIndex, BlockIndexer, - BlockSelection, CoordinateIndexer, CoordinateSelection, Fields, @@ -51,6 +51,7 @@ OIndex, OrthogonalIndexer, OrthogonalSelection, + Selection, VIndex, check_fields, check_no_multi_fields, @@ -64,8 +65,8 @@ from zarr.sync import sync -def parse_array_metadata(data: Any) -> ArrayMetadata: - if isinstance(data, ArrayMetadata): +def parse_array_metadata(data: Any) -> ArrayV2Metadata | ArrayV3Metadata: + if isinstance(data, ArrayV2Metadata | ArrayV3Metadata): return data elif isinstance(data, dict): if data["zarr_format"] == 3: @@ -75,10 +76,22 @@ def parse_array_metadata(data: Any) -> ArrayMetadata: raise TypeError +def create_codec_pipeline(metadata: ArrayV2Metadata | ArrayV3Metadata) -> BatchedCodecPipeline: + if isinstance(metadata, ArrayV3Metadata): + return BatchedCodecPipeline.from_list(metadata.codecs) + elif isinstance(metadata, ArrayV2Metadata): + return BatchedCodecPipeline.from_list( + [V2Filters(metadata.filters or []), V2Compressor(metadata.compressor)] + ) + else: + raise AssertionError + + 
@dataclass(frozen=True) class AsyncArray: metadata: ArrayMetadata store_path: StorePath + codec_pipeline: CodecPipeline = field(init=False) order: Literal["C", "F"] def __init__( @@ -93,6 +106,7 @@ def __init__( object.__setattr__(self, "metadata", metadata_parsed) object.__setattr__(self, "store_path", store_path) object.__setattr__(self, "order", order_parsed) + object.__setattr__(self, "codec_pipeline", create_codec_pipeline(metadata=metadata_parsed)) @classmethod async def create( @@ -444,7 +458,7 @@ async def _get_selection( ) if product(indexer.shape) > 0: # reading chunks and decoding them - await self.metadata.codec_pipeline.read( + await self.codec_pipeline.read( [ ( self.store_path / self.metadata.encode_chunk_key(chunk_coords), @@ -460,7 +474,7 @@ async def _get_selection( return out_buffer.as_ndarray_like() async def getitem( - self, selection: Selection, *, prototype: BufferPrototype = default_buffer_prototype + self, selection: BasicSelection, *, prototype: BufferPrototype = default_buffer_prototype ) -> NDArrayLike: indexer = BasicIndexer( selection, @@ -477,7 +491,7 @@ async def _save_metadata(self, metadata: ArrayMetadata) -> None: async def _set_selection( self, indexer: Indexer, - value: NDArrayLike, + value: npt.ArrayLike, *, prototype: BufferPrototype, fields: Fields | None = None, @@ -488,23 +502,23 @@ async def _set_selection( # check value shape if np.isscalar(value): - value = np.asanyarray(value) + value = np.asanyarray(value, dtype=self.metadata.dtype) else: if not hasattr(value, "shape"): value = np.asarray(value, self.metadata.dtype) # assert ( # value.shape == indexer.shape # ), f"shape of value doesn't match indexer shape. 
Expected {indexer.shape}, got {value.shape}" - if value.dtype.name != self.metadata.dtype.name: - value = value.astype(self.metadata.dtype, order="A") - + if not hasattr(value, "dtype") or value.dtype.name != self.metadata.dtype.name: + value = np.array(value, dtype=self.metadata.dtype, order="A") + value = cast(NDArrayLike, value) # We accept any ndarray like object from the user and convert it # to a NDBuffer (or subclass). From this point onwards, we only pass # Buffer and NDBuffer between components. value_buffer = prototype.nd_buffer.from_ndarray_like(value) # merging with existing data and encoding chunks - await self.metadata.codec_pipeline.write( + await self.codec_pipeline.write( [ ( self.store_path / self.metadata.encode_chunk_key(chunk_coords), @@ -520,8 +534,8 @@ async def _set_selection( async def setitem( self, - selection: Selection, - value: NDArrayLike, + selection: BasicSelection, + value: npt.ArrayLike, prototype: BufferPrototype = default_buffer_prototype, ) -> None: indexer = BasicIndexer( @@ -701,7 +715,167 @@ def order(self) -> Literal["C", "F"]: def read_only(self) -> bool: return self._async_array.read_only + def __array__( + self, dtype: npt.DTypeLike | None = None, copy: bool | None = None + ) -> NDArrayLike: + """ + This method is used by numpy when converting zarr.Array into a numpy array. + For more information, see https://numpy.org/devdocs/user/basics.interoperability.html#the-array-method + """ + if copy is False: + msg = "`copy=False` is not supported. This method always creates a copy." + raise ValueError(msg) + + arr_np = self[...] + + if dtype is not None: + arr_np = arr_np.astype(dtype) + + return arr_np + def __getitem__(self, selection: Selection) -> NDArrayLike: + """Retrieve data for an item or region of the array. + + Parameters + ---------- + selection : tuple + An integer index or slice or tuple of int/slice objects specifying the + requested item or region for each dimension of the array. 
+ + Returns + ------- + NDArrayLike + An array-like containing the data for the requested region. + + Examples + -------- + Setup a 1-dimensional array:: + + >>> import zarr + >>> import numpy as np + >>> data = np.arange(100, dtype="uint16") + >>> z = Array.create( + >>> StorePath(MemoryStore(mode="w")), + >>> shape=data.shape, + >>> chunk_shape=(10,), + >>> dtype=data.dtype, + >>> ) + >>> z[:] = data + + Retrieve a single item:: + + >>> z[5] + 5 + + Retrieve a region via slicing:: + + >>> z[:5] + array([0, 1, 2, 3, 4]) + >>> z[-5:] + array([95, 96, 97, 98, 99]) + >>> z[5:10] + array([5, 6, 7, 8, 9]) + >>> z[5:10:2] + array([5, 7, 9]) + >>> z[::2] + array([ 0, 2, 4, ..., 94, 96, 98]) + + Load the entire array into memory:: + + >>> z[...] + array([ 0, 1, 2, ..., 97, 98, 99]) + + Setup a 2-dimensional array:: + + >>> data = np.arange(100, dtype="uint16").reshape(10, 10) + >>> z = Array.create( + >>> StorePath(MemoryStore(mode="w")), + >>> shape=data.shape, + >>> chunk_shape=(10, 10), + >>> dtype=data.dtype, + >>> ) + >>> z[:] = data + + Retrieve an item:: + + >>> z[2, 2] + 22 + + Retrieve a region via slicing:: + + >>> z[1:3, 1:3] + array([[11, 12], + [21, 22]]) + >>> z[1:3, :] + array([[10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]]) + >>> z[:, 1:3] + array([[ 1, 2], + [11, 12], + [21, 22], + [31, 32], + [41, 42], + [51, 52], + [61, 62], + [71, 72], + [81, 82], + [91, 92]]) + >>> z[0:5:2, 0:5:2] + array([[ 0, 2, 4], + [20, 22, 24], + [40, 42, 44]]) + >>> z[::2, ::2] + array([[ 0, 2, 4, 6, 8], + [20, 22, 24, 26, 28], + [40, 42, 44, 46, 48], + [60, 62, 64, 66, 68], + [80, 82, 84, 86, 88]]) + + Load the entire array into memory:: + + >>> z[...] 
+ array([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + [10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + [20, 21, 22, 23, 24, 25, 26, 27, 28, 29], + [30, 31, 32, 33, 34, 35, 36, 37, 38, 39], + [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], + [50, 51, 52, 53, 54, 55, 56, 57, 58, 59], + [60, 61, 62, 63, 64, 65, 66, 67, 68, 69], + [70, 71, 72, 73, 74, 75, 76, 77, 78, 79], + [80, 81, 82, 83, 84, 85, 86, 87, 88, 89], + [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]]) + + Notes + ----- + Slices with step > 1 are supported, but slices with negative step are not. + + For arrays with a structured dtype, see zarr v2 for examples of how to use + fields + + Currently the implementation for __getitem__ is provided by + :func:`vindex` if the indexing is pure fancy indexing (ie a + broadcast-compatible tuple of integer array indices), or by + :func:`set_basic_selection` otherwise. + + Effectively, this means that the following indexing modes are supported: + + - integer indexing + - slice indexing + - mixed slice and integer indexing + - boolean indexing + - fancy indexing (vectorized list of integers) + + For specific indexing options including outer indexing, see the + methods listed under See Also. 
+ + See Also + -------- + get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, + get_coordinate_selection, set_coordinate_selection, get_orthogonal_selection, + set_orthogonal_selection, get_block_selection, set_block_selection, + vindex, oindex, blocks, __setitem__ + + """ fields, pure_selection = pop_fields(selection) if is_pure_fancy_indexing(pure_selection, self.ndim): return self.vindex[cast(CoordinateSelection | MaskSelection, selection)] @@ -710,7 +884,97 @@ def __getitem__(self, selection: Selection) -> NDArrayLike: else: return self.get_basic_selection(cast(BasicSelection, pure_selection), fields=fields) - def __setitem__(self, selection: Selection, value: NDArrayLike) -> None: + def __setitem__(self, selection: Selection, value: npt.ArrayLike) -> None: + """Modify data for an item or region of the array. + + Parameters + ---------- + selection : tuple + An integer index or slice or tuple of int/slice specifying the requested + region for each dimension of the array. + value : npt.ArrayLike + An array-like containing the data to be stored in the selection. + + Examples + -------- + Setup a 1-dimensional array:: + + >>> import zarr + >>> z = zarr.zeros( + >>> shape=(100,), + >>> store=StorePath(MemoryStore(mode="w")), + >>> chunk_shape=(5,), + >>> dtype="i4", + >>> ) + + Set all array elements to the same scalar value:: + + >>> z[...] = 42 + >>> z[...] + array([42, 42, 42, ..., 42, 42, 42]) + + Set a portion of the array:: + + >>> z[:10] = np.arange(10) + >>> z[-10:] = np.arange(10)[::-1] + >>> z[...] + array([ 0, 1, 2, ..., 2, 1, 0]) + + Setup a 2-dimensional array:: + + >>> z = zarr.zeros( + >>> shape=(5, 5), + >>> store=StorePath(MemoryStore(mode="w")), + >>> chunk_shape=(5, 5), + >>> dtype="i4", + >>> ) + + Set all array elements to the same scalar value:: + + >>> z[...] = 42 + + Set a portion of the array:: + + >>> z[0, :] = np.arange(z.shape[1]) + >>> z[:, 0] = np.arange(z.shape[0]) + >>> z[...] 
+ array([[ 0, 1, 2, 3, 4], + [ 1, 42, 42, 42, 42], + [ 2, 42, 42, 42, 42], + [ 3, 42, 42, 42, 42], + [ 4, 42, 42, 42, 42]]) + + Notes + ----- + Slices with step > 1 are supported, but slices with negative step are not. + + For arrays with a structured dtype, see zarr v2 for examples of how to use + fields + + Currently the implementation for __setitem__ is provided by + :func:`vindex` if the indexing is pure fancy indexing (ie a + broadcast-compatible tuple of integer array indices), or by + :func:`set_basic_selection` otherwise. + + Effectively, this means that the following indexing modes are supported: + + - integer indexing + - slice indexing + - mixed slice and integer indexing + - boolean indexing + - fancy indexing (vectorized list of integers) + + For specific indexing options including outer indexing, see the + methods listed under See Also. + + See Also + -------- + get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, + get_coordinate_selection, set_coordinate_selection, get_orthogonal_selection, + set_orthogonal_selection, get_block_selection, set_block_selection, + vindex, oindex, blocks, __getitem__ + + """ fields, pure_selection = pop_fields(selection) if is_pure_fancy_indexing(pure_selection, self.ndim): self.vindex[cast(CoordinateSelection | MaskSelection, selection)] = value @@ -727,26 +991,209 @@ def get_basic_selection( prototype: BufferPrototype = default_buffer_prototype, fields: Fields | None = None, ) -> NDArrayLike: - if self.shape == (): - raise NotImplementedError - else: - return sync( - self._async_array._get_selection( - BasicIndexer(selection, self.shape, self.metadata.chunk_grid), - out=out, - fields=fields, - prototype=prototype, - ) + """Retrieve data for an item or region of the array. + + Parameters + ---------- + selection : tuple + A tuple specifying the requested item or region for each dimension of the + array. May be any combination of int and/or slice or ellipsis for multidimensional arrays. 
+ out : NDBuffer, optional + If given, load the selected data directly into this buffer. + fields : str or sequence of str, optional + For arrays with a structured dtype, one or more fields can be specified to + extract data for. + prototype : BufferPrototype, optional + The prototype of the buffer to use for the output data. If not provided, the default buffer prototype is used. + + Returns + ------- + NDArrayLike + An array-like containing the data for the requested region. + + Examples + -------- + Setup a 1-dimensional array:: + + >>> import zarr + >>> import numpy as np + >>> data = np.arange(100, dtype="uint16") + >>> z = Array.create( + >>> StorePath(MemoryStore(mode="w")), + >>> shape=data.shape, + >>> chunk_shape=(3,), + >>> dtype=data.dtype, + >>> ) + >>> z[:] = data + + Retrieve a single item:: + + >>> z.get_basic_selection(5) + 5 + + Retrieve a region via slicing:: + + >>> z.get_basic_selection(slice(5)) + array([0, 1, 2, 3, 4]) + >>> z.get_basic_selection(slice(-5, None)) + array([95, 96, 97, 98, 99]) + >>> z.get_basic_selection(slice(5, 10)) + array([5, 6, 7, 8, 9]) + >>> z.get_basic_selection(slice(5, 10, 2)) + array([5, 7, 9]) + >>> z.get_basic_selection(slice(None, None, 2)) + array([ 0, 2, 4, ..., 94, 96, 98]) + + Setup a 3-dimensional array:: + + >>> data = np.arange(1000).reshape(10, 10, 10) + >>> z = Array.create( + >>> StorePath(MemoryStore(mode="w")), + >>> shape=data.shape, + >>> chunk_shape=(5, 5, 5), + >>> dtype=data.dtype, + >>> ) + >>> z[:] = data + + Retrieve an item:: + + >>> z.get_basic_selection((1, 2, 3)) + 123 + + Retrieve a region via slicing and Ellipsis:: + + >>> z.get_basic_selection((slice(1, 3), slice(1, 3), 0)) + array([[110, 120], + [210, 220]]) + >>> z.get_basic_selection(0, (slice(1, 3), slice(None))) + array([[10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]]) + >>> z.get_basic_selection((..., 5)) + array([[ 2 12 22 32 42 52 62 72 82 92] + [102 112 122 132 142 152 162 172 182 192] + ... 
+ [802 812 822 832 842 852 862 872 882 892] + [902 912 922 932 942 952 962 972 982 992]] + + Notes + ----- + Slices with step > 1 are supported, but slices with negative step are not. + + For arrays with a structured dtype, see zarr v2 for examples of how to use + the `fields` parameter. + + This method provides the implementation for accessing data via the + square bracket notation (__getitem__). See :func:`__getitem__` for examples + using the alternative notation. + + See Also + -------- + set_basic_selection, get_mask_selection, set_mask_selection, + get_coordinate_selection, set_coordinate_selection, get_orthogonal_selection, + set_orthogonal_selection, get_block_selection, set_block_selection, + vindex, oindex, blocks, __getitem__, __setitem__ + + """ + + return sync( + self._async_array._get_selection( + BasicIndexer(selection, self.shape, self.metadata.chunk_grid), + out=out, + fields=fields, + prototype=prototype, ) + ) def set_basic_selection( self, selection: BasicSelection, - value: NDArrayLike, + value: npt.ArrayLike, *, fields: Fields | None = None, prototype: BufferPrototype = default_buffer_prototype, ) -> None: + """Modify data for an item or region of the array. + + Parameters + ---------- + selection : tuple + A tuple specifying the requested item or region for each dimension of the + array. May be any combination of int and/or slice or ellipsis for multidimensional arrays. + value : npt.ArrayLike + An array-like containing values to be stored into the array. + fields : str or sequence of str, optional + For arrays with a structured dtype, one or more fields can be specified to set + data for. + prototype : BufferPrototype, optional + The prototype of the buffer used for setting the data. If not provided, the + default buffer prototype is used. 
+ + Examples + -------- + Setup a 1-dimensional array:: + + >>> import zarr + >>> z = zarr.zeros( + >>> shape=(100,), + >>> store=StorePath(MemoryStore(mode="w")), + >>> chunk_shape=(100,), + >>> dtype="i4", + >>> ) + + Set all array elements to the same scalar value:: + + >>> z.set_basic_selection(..., 42) + >>> z[...] + array([42, 42, 42, ..., 42, 42, 42]) + + Set a portion of the array:: + + >>> z.set_basic_selection(slice(10), np.arange(10)) + >>> z.set_basic_selection(slice(-10, None), np.arange(10)[::-1]) + >>> z[...] + array([ 0, 1, 2, ..., 2, 1, 0]) + + Setup a 2-dimensional array:: + + >>> z = zarr.zeros( + >>> shape=(5, 5), + >>> store=StorePath(MemoryStore(mode="w")), + >>> chunk_shape=(5, 5), + >>> dtype="i4", + >>> ) + + Set all array elements to the same scalar value:: + + >>> z.set_basic_selection(..., 42) + + Set a portion of the array:: + + >>> z.set_basic_selection((0, slice(None)), np.arange(z.shape[1])) + >>> z.set_basic_selection((slice(None), 0), np.arange(z.shape[0])) + >>> z[...] + array([[ 0, 1, 2, 3, 4], + [ 1, 42, 42, 42, 42], + [ 2, 42, 42, 42, 42], + [ 3, 42, 42, 42, 42], + [ 4, 42, 42, 42, 42]]) + + Notes + ----- + For arrays with a structured dtype, see zarr v2 for examples of how to use + the `fields` parameter. + + This method provides the underlying implementation for modifying data via square + bracket notation, see :func:`__setitem__` for equivalent examples using the + alternative notation. 
+ + See Also + -------- + get_basic_selection, get_mask_selection, set_mask_selection, + get_coordinate_selection, set_coordinate_selection, get_orthogonal_selection, + set_orthogonal_selection, get_block_selection, set_block_selection, + vindex, oindex, blocks, __getitem__, __setitem__ + + """ indexer = BasicIndexer(selection, self.shape, self.metadata.chunk_grid) sync(self._async_array._set_selection(indexer, value, fields=fields, prototype=prototype)) @@ -758,6 +1205,113 @@ def get_orthogonal_selection( fields: Fields | None = None, prototype: BufferPrototype = default_buffer_prototype, ) -> NDArrayLike: + """Retrieve data by making a selection for each dimension of the array. For + example, if an array has 2 dimensions, allows selecting specific rows and/or + columns. The selection for each dimension can be either an integer (indexing a + single item), a slice, an array of integers, or a Boolean array where True + values indicate a selection. + + Parameters + ---------- + selection : tuple + A selection for each dimension of the array. May be any combination of int, + slice, integer array or Boolean array. + out : NDBuffer, optional + If given, load the selected data directly into this buffer. + fields : str or sequence of str, optional + For arrays with a structured dtype, one or more fields can be specified to + extract data for. + prototype : BufferPrototype, optional + The prototype of the buffer to use for the output data. If not provided, the default buffer prototype is used. + + Returns + ------- + NDArrayLike + An array-like containing the data for the requested selection. 
+ + Examples + -------- + Setup a 2-dimensional array:: + + >>> import zarr + >>> import numpy as np + >>> data = np.arange(100).reshape(10, 10) + >>> z = Array.create( + >>> StorePath(MemoryStore(mode="w")), + >>> shape=data.shape, + >>> chunk_shape=data.shape, + >>> dtype=data.dtype, + >>> ) + >>> z[:] = data + + Retrieve rows and columns via any combination of int, slice, integer array and/or + Boolean array:: + + >>> z.get_orthogonal_selection(([1, 4], slice(None))) + array([[10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + [40, 41, 42, 43, 44, 45, 46, 47, 48, 49]]) + >>> z.get_orthogonal_selection((slice(None), [1, 4])) + array([[ 1, 4], + [11, 14], + [21, 24], + [31, 34], + [41, 44], + [51, 54], + [61, 64], + [71, 74], + [81, 84], + [91, 94]]) + >>> z.get_orthogonal_selection(([1, 4], [1, 4])) + array([[11, 14], + [41, 44]]) + >>> sel = np.zeros(z.shape[0], dtype=bool) + >>> sel[1] = True + >>> sel[4] = True + >>> z.get_orthogonal_selection((sel, sel)) + array([[11, 14], + [41, 44]]) + + For convenience, the orthogonal selection functionality is also available via the + `oindex` property, e.g.:: + + >>> z.oindex[[1, 4], :] + array([[10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + [40, 41, 42, 43, 44, 45, 46, 47, 48, 49]]) + >>> z.oindex[:, [1, 4]] + array([[ 1, 4], + [11, 14], + [21, 24], + [31, 34], + [41, 44], + [51, 54], + [61, 64], + [71, 74], + [81, 84], + [91, 94]]) + >>> z.oindex[[1, 4], [1, 4]] + array([[11, 14], + [41, 44]]) + >>> sel = np.zeros(z.shape[0], dtype=bool) + >>> sel[1] = True + >>> sel[4] = True + >>> z.oindex[sel, sel] + array([[11, 14], + [41, 44]]) + + Notes + ----- + Orthogonal indexing is also known as outer indexing. + + Slices with step > 1 are supported, but slices with negative step are not. 
+ + See Also + -------- + get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, + get_coordinate_selection, set_coordinate_selection, set_orthogonal_selection, + get_block_selection, set_block_selection, + vindex, oindex, blocks, __getitem__, __setitem__ + + """ indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid) return sync( self._async_array._get_selection( @@ -768,11 +1322,106 @@ def get_orthogonal_selection( def set_orthogonal_selection( self, selection: OrthogonalSelection, - value: NDArrayLike, + value: npt.ArrayLike, *, fields: Fields | None = None, prototype: BufferPrototype = default_buffer_prototype, ) -> None: + """Modify data via a selection for each dimension of the array. + + Parameters + ---------- + selection : tuple + A selection for each dimension of the array. May be any combination of int, + slice, integer array or Boolean array. + value : npt.ArrayLike + An array-like array containing the data to be stored in the array. + fields : str or sequence of str, optional + For arrays with a structured dtype, one or more fields can be specified to set + data for. + prototype : BufferPrototype, optional + The prototype of the buffer used for setting the data. If not provided, the + default buffer prototype is used. + + Examples + -------- + Setup a 2-dimensional array:: + + >>> import zarr + >>> z = zarr.zeros( + >>> shape=(5, 5), + >>> store=StorePath(MemoryStore(mode="w")), + >>> chunk_shape=(5, 5), + >>> dtype="i4", + >>> ) + + + Set data for a selection of rows:: + + >>> z.set_orthogonal_selection(([1, 4], slice(None)), 1) + >>> z[...] + array([[0, 0, 0, 0, 0], + [1, 1, 1, 1, 1], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [1, 1, 1, 1, 1]]) + + Set data for a selection of columns:: + + >>> z.set_orthogonal_selection((slice(None), [1, 4]), 2) + >>> z[...] 
+ array([[0, 2, 0, 0, 2], + [1, 2, 1, 1, 2], + [0, 2, 0, 0, 2], + [0, 2, 0, 0, 2], + [1, 2, 1, 1, 2]]) + + Set data for a selection of rows and columns:: + + >>> z.set_orthogonal_selection(([1, 4], [1, 4]), 3) + >>> z[...] + array([[0, 2, 0, 0, 2], + [1, 3, 1, 1, 3], + [0, 2, 0, 0, 2], + [0, 2, 0, 0, 2], + [1, 3, 1, 1, 3]]) + + Set data from a 2D array:: + + >>> values = np.arange(10).reshape(2, 5) + >>> z.set_orthogonal_selection(([0, 3], ...), values) + >>> z[...] + array([[0, 1, 2, 3, 4], + [1, 3, 1, 1, 3], + [0, 2, 0, 0, 2], + [5, 6, 7, 8, 9], + [1, 3, 1, 1, 3]]) + + For convenience, this functionality is also available via the `oindex` property. + E.g.:: + + >>> z.oindex[[1, 4], [1, 4]] = 4 + >>> z[...] + array([[0, 1, 2, 3, 4], + [1, 4, 1, 1, 4], + [0, 2, 0, 0, 2], + [5, 6, 7, 8, 9], + [1, 4, 1, 1, 4]]) + + Notes + ----- + Orthogonal indexing is also known as outer indexing. + + Slices with step > 1 are supported, but slices with negative step are not. + + See Also + -------- + get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, + get_coordinate_selection, set_coordinate_selection, get_orthogonal_selection, + get_block_selection, set_block_selection, + vindex, oindex, blocks, __getitem__, __setitem__ + + """ indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid) return sync( self._async_array._set_selection(indexer, value, fields=fields, prototype=prototype) @@ -786,6 +1435,71 @@ def get_mask_selection( fields: Fields | None = None, prototype: BufferPrototype = default_buffer_prototype, ) -> NDArrayLike: + """Retrieve a selection of individual items, by providing a Boolean array of the + same shape as the array against which the selection is being made, where True + values indicate a selected item. + + Parameters + ---------- + selection : ndarray, bool + A Boolean array of the same shape as the array against which the selection is + being made. 
+ out : NDBuffer, optional + If given, load the selected data directly into this buffer. + fields : str or sequence of str, optional + For arrays with a structured dtype, one or more fields can be specified to + extract data for. + prototype : BufferPrototype, optional + The prototype of the buffer to use for the output data. If not provided, the default buffer prototype is used. + + Returns + ------- + NDArrayLike + An array-like containing the data for the requested selection. + + Examples + -------- + Setup a 2-dimensional array:: + + >>> import zarr + >>> import numpy as np + >>> data = np.arange(100).reshape(10, 10) + >>> z = Array.create( + >>> StorePath(MemoryStore(mode="w")), + >>> shape=data.shape, + >>> chunk_shape=data.shape, + >>> dtype=data.dtype, + >>> ) + >>> z[:] = data + + Retrieve items by specifying a mask:: + + >>> sel = np.zeros_like(z, dtype=bool) + >>> sel[1, 1] = True + >>> sel[4, 4] = True + >>> z.get_mask_selection(sel) + array([11, 44]) + + For convenience, the mask selection functionality is also available via the + `vindex` property, e.g.:: + + >>> z.vindex[sel] + array([11, 44]) + + Notes + ----- + Mask indexing is a form of vectorized or inner indexing, and is equivalent to + coordinate indexing. Internally the mask array is converted to coordinate + arrays by calling `np.nonzero`. 
+ + See Also + -------- + get_basic_selection, set_basic_selection, set_mask_selection, + get_orthogonal_selection, set_orthogonal_selection, get_coordinate_selection, + set_coordinate_selection, get_block_selection, set_block_selection, + vindex, oindex, blocks, __getitem__, __setitem__ + """ + indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) return sync( self._async_array._get_selection( @@ -796,11 +1510,76 @@ def get_mask_selection( def set_mask_selection( self, mask: MaskSelection, - value: NDArrayLike, + value: npt.ArrayLike, *, fields: Fields | None = None, prototype: BufferPrototype = default_buffer_prototype, ) -> None: + """Modify a selection of individual items, by providing a Boolean array of the + same shape as the array against which the selection is being made, where True + values indicate a selected item. + + Parameters + ---------- + selection : ndarray, bool + A Boolean array of the same shape as the array against which the selection is + being made. + value : npt.ArrayLike + An array-like containing values to be stored into the array. + fields : str or sequence of str, optional + For arrays with a structured dtype, one or more fields can be specified to set + data for. + + Examples + -------- + Setup a 2-dimensional array:: + + >>> import zarr + >>> z = zarr.zeros( + >>> shape=(5, 5), + >>> store=StorePath(MemoryStore(mode="w")), + >>> chunk_shape=(5, 5), + >>> dtype="i4", + >>> ) + + Set data for a selection of items:: + + >>> sel = np.zeros_like(z, dtype=bool) + >>> sel[1, 1] = True + >>> sel[4, 4] = True + >>> z.set_mask_selection(sel, 1) + >>> z[...] + array([[0, 0, 0, 0, 0], + [0, 1, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 1]]) + + For convenience, this functionality is also available via the `vindex` property. + E.g.:: + + >>> z.vindex[sel] = 2 + >>> z[...] 
+ array([[0, 0, 0, 0, 0], + [0, 2, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 2]]) + + Notes + ----- + Mask indexing is a form of vectorized or inner indexing, and is equivalent to + coordinate indexing. Internally the mask array is converted to coordinate + arrays by calling `np.nonzero`. + + See Also + -------- + get_basic_selection, set_basic_selection, get_mask_selection, + get_orthogonal_selection, set_orthogonal_selection, get_coordinate_selection, + set_coordinate_selection, get_block_selection, set_block_selection, + vindex, oindex, blocks, __getitem__, __setitem__ + + """ indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) sync(self._async_array._set_selection(indexer, value, fields=fields, prototype=prototype)) @@ -812,6 +1591,73 @@ def get_coordinate_selection( fields: Fields | None = None, prototype: BufferPrototype = default_buffer_prototype, ) -> NDArrayLike: + """Retrieve a selection of individual items, by providing the indices + (coordinates) for each selected item. + + Parameters + ---------- + selection : tuple + An integer (coordinate) array for each dimension of the array. + out : NDBuffer, optional + If given, load the selected data directly into this buffer. + fields : str or sequence of str, optional + For arrays with a structured dtype, one or more fields can be specified to + extract data for. + prototype : BufferPrototype, optional + The prototype of the buffer to use for the output data. If not provided, the default buffer prototype is used. + + Returns + ------- + NDArrayLike + An array-like containing the data for the requested coordinate selection. 
+ + Examples + -------- + Setup a 2-dimensional array:: + + >>> import zarr + >>> import numpy as np + >>> data = np.arange(0, 100, dtype="uint16").reshape((10, 10)) + >>> z = Array.create( + >>> StorePath(MemoryStore(mode="w")), + >>> shape=data.shape, + >>> chunk_shape=(3, 3), + >>> dtype=data.dtype, + >>> ) + >>> z[:] = data + + Retrieve items by specifying their coordinates:: + + >>> z.get_coordinate_selection(([1, 4], [1, 4])) + array([11, 44]) + + For convenience, the coordinate selection functionality is also available via the + `vindex` property, e.g.:: + + >>> z.vindex[[1, 4], [1, 4]] + array([11, 44]) + + Notes + ----- + Coordinate indexing is also known as point selection, and is a form of vectorized + or inner indexing. + + Slices are not supported. Coordinate arrays must be provided for all dimensions + of the array. + + Coordinate arrays may be multidimensional, in which case the output array will + also be multidimensional. Coordinate arrays are broadcast against each other + before being applied. The shape of the output will be the same as the shape of + each coordinate array after broadcasting. 
+ + See Also + -------- + get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, + get_orthogonal_selection, set_orthogonal_selection, set_coordinate_selection, + get_block_selection, set_block_selection, + vindex, oindex, blocks, __getitem__, __setitem__ + + """ indexer = CoordinateIndexer(selection, self.shape, self.metadata.chunk_grid) out_array = sync( self._async_array._get_selection( @@ -819,18 +1665,81 @@ def get_coordinate_selection( ) ) - # restore shape - out_array = out_array.reshape(indexer.sel_shape) + if hasattr(out_array, "shape"): + # restore shape + out_array = np.array(out_array).reshape(indexer.sel_shape) return out_array def set_coordinate_selection( self, selection: CoordinateSelection, - value: NDArrayLike, + value: npt.ArrayLike, *, fields: Fields | None = None, prototype: BufferPrototype = default_buffer_prototype, ) -> None: + """Modify a selection of individual items, by providing the indices (coordinates) + for each item to be modified. + + Parameters + ---------- + selection : tuple + An integer (coordinate) array for each dimension of the array. + value : npt.ArrayLike + An array-like containing values to be stored into the array. + fields : str or sequence of str, optional + For arrays with a structured dtype, one or more fields can be specified to set + data for. + + Examples + -------- + Setup a 2-dimensional array:: + + >>> import zarr + >>> z = zarr.zeros( + >>> shape=(5, 5), + >>> store=StorePath(MemoryStore(mode="w")), + >>> chunk_shape=(5, 5), + >>> dtype="i4", + >>> ) + + Set data for a selection of items:: + + >>> z.set_coordinate_selection(([1, 4], [1, 4]), 1) + >>> z[...] + array([[0, 0, 0, 0, 0], + [0, 1, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 1]]) + + For convenience, this functionality is also available via the `vindex` property. + E.g.:: + + >>> z.vindex[[1, 4], [1, 4]] = 2 + >>> z[...] 
+            array([[0, 0, 0, 0, 0],
+                   [0, 2, 0, 0, 0],
+                   [0, 0, 0, 0, 0],
+                   [0, 0, 0, 0, 0],
+                   [0, 0, 0, 0, 2]])
+
+        Notes
+        -----
+        Coordinate indexing is also known as point selection, and is a form of vectorized
+        or inner indexing.
+
+        Slices are not supported. Coordinate arrays must be provided for all dimensions
+        of the array.
+
+        See Also
+        --------
+        get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection,
+        get_orthogonal_selection, set_orthogonal_selection, get_coordinate_selection,
+        get_block_selection, set_block_selection,
+        vindex, oindex, blocks, __getitem__, __setitem__
+
+        """
         # setup indexer
         indexer = CoordinateIndexer(selection, self.shape, self.metadata.chunk_grid)
@@ -844,18 +1753,99 @@ def set_coordinate_selection(
             # Handle types like `list` or `tuple`
             value = np.array(value)  # TODO replace with agnostic
         if hasattr(value, "shape") and len(value.shape) > 1:
-            value = value.reshape(-1)
+            value = np.array(value).reshape(-1)
         sync(self._async_array._set_selection(indexer, value, fields=fields, prototype=prototype))
 
     def get_block_selection(
         self,
-        selection: BlockSelection,
+        selection: BasicSelection,
         *,
         out: NDBuffer | None = None,
         fields: Fields | None = None,
         prototype: BufferPrototype = default_buffer_prototype,
     ) -> NDArrayLike:
+        """Retrieve a selection of individual blocks, by providing the chunk indices
+        (coordinates) for each selected block.
+
+        Parameters
+        ----------
+        selection : int or slice or tuple of int or slice
+            An integer (coordinate) or slice for each dimension of the array.
+        out : NDBuffer, optional
+            If given, load the selected data directly into this buffer.
+        fields : str or sequence of str, optional
+            For arrays with a structured dtype, one or more fields can be specified to
+            extract data for.
+        prototype : BufferPrototype, optional
+            The prototype of the buffer to use for the output data. If not provided, the default buffer prototype is used.
+ + Returns + ------- + NDArrayLike + An array-like containing the data for the requested block selection. + + Examples + -------- + Setup a 2-dimensional array:: + + >>> import zarr + >>> import numpy as np + >>> data = np.arange(0, 100, dtype="uint16").reshape((10, 10)) + >>> z = Array.create( + >>> StorePath(MemoryStore(mode="w")), + >>> shape=data.shape, + >>> chunk_shape=(3, 3), + >>> dtype=data.dtype, + >>> ) + >>> z[:] = data + + Retrieve items by specifying their block coordinates:: + + >>> z.get_block_selection((1, slice(None))) + array([[30, 31, 32, 33, 34, 35, 36, 37, 38, 39], + [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], + [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]]) + + Which is equivalent to:: + + >>> z[3:6, :] + array([[30, 31, 32, 33, 34, 35, 36, 37, 38, 39], + [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], + [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]]) + + For convenience, the block selection functionality is also available via the + `blocks` property, e.g.:: + + >>> z.blocks[1] + array([[30, 31, 32, 33, 34, 35, 36, 37, 38, 39], + [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], + [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]]) + + Notes + ----- + Block indexing is a convenience indexing method to work on individual chunks + with chunk index slicing. It has the same concept as Dask's `Array.blocks` + indexing. + + Slices are supported. However, only with a step size of one. + + Block index arrays may be multidimensional to index multidimensional arrays. 
+ For example:: + + >>> z.blocks[0, 1:3] + array([[ 3, 4, 5, 6, 7, 8], + [13, 14, 15, 16, 17, 18], + [23, 24, 25, 26, 27, 28]]) + + See Also + -------- + get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, + get_orthogonal_selection, set_orthogonal_selection, get_coordinate_selection, + set_coordinate_selection, set_block_selection, + vindex, oindex, blocks, __getitem__, __setitem__ + + """ indexer = BlockIndexer(selection, self.shape, self.metadata.chunk_grid) return sync( self._async_array._get_selection( @@ -865,28 +1855,147 @@ def get_block_selection( def set_block_selection( self, - selection: BlockSelection, - value: NDArrayLike, + selection: BasicSelection, + value: npt.ArrayLike, *, fields: Fields | None = None, prototype: BufferPrototype = default_buffer_prototype, ) -> None: + """Modify a selection of individual blocks, by providing the chunk indices + (coordinates) for each block to be modified. + + Parameters + ---------- + selection : tuple + An integer (coordinate) or slice for each dimension of the array. + value : npt.ArrayLike + An array-like containing the data to be stored in the block selection. + fields : str or sequence of str, optional + For arrays with a structured dtype, one or more fields can be specified to set + data for. + prototype : BufferPrototype, optional + The prototype of the buffer used for setting the data. If not provided, the + default buffer prototype is used. + + Examples + -------- + Set up a 2-dimensional array:: + + >>> import zarr + >>> z = zarr.zeros( + >>> shape=(6, 6), + >>> store=StorePath(MemoryStore(mode="w")), + >>> chunk_shape=(2, 2), + >>> dtype="i4", + >>> ) + + Set data for a selection of items:: + + >>> z.set_block_selection((1, 0), 1) + >>> z[...] + array([[0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0]]) + + For convenience, this functionality is also available via the `blocks` property. 
+ E.g.:: + + >>> z.blocks[2, 1] = 4 + >>> z[...] + array([[0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [0, 0, 4, 4, 0, 0], + [0, 0, 4, 4, 0, 0]]) + + >>> z.blocks[:, 2] = 7 + >>> z[...] + array([[0, 0, 0, 0, 7, 7], + [0, 0, 0, 0, 7, 7], + [1, 1, 0, 0, 7, 7], + [1, 1, 0, 0, 7, 7], + [0, 0, 4, 4, 7, 7], + [0, 0, 4, 4, 7, 7]]) + + Notes + ----- + Block indexing is a convenience indexing method to work on individual chunks + with chunk index slicing. It has the same concept as Dask's `Array.blocks` + indexing. + + Slices are supported. However, only with a step size of one. + + See Also + -------- + get_basic_selection, set_basic_selection, get_mask_selection, set_mask_selection, + get_orthogonal_selection, set_orthogonal_selection, get_coordinate_selection, + get_block_selection, set_block_selection, + vindex, oindex, blocks, __getitem__, __setitem__ + + """ indexer = BlockIndexer(selection, self.shape, self.metadata.chunk_grid) sync(self._async_array._set_selection(indexer, value, fields=fields, prototype=prototype)) @property def vindex(self) -> VIndex: + """Shortcut for vectorized (inner) indexing, see :func:`get_coordinate_selection`, + :func:`set_coordinate_selection`, :func:`get_mask_selection` and + :func:`set_mask_selection` for documentation and examples.""" return VIndex(self) @property def oindex(self) -> OIndex: + """Shortcut for orthogonal (outer) indexing, see :func:`get_orthogonal_selection` and + :func:`set_orthogonal_selection` for documentation and examples.""" return OIndex(self) @property def blocks(self) -> BlockIndex: + """Shortcut for blocked chunked indexing, see :func:`get_block_selection` and + :func:`set_block_selection` for documentation and examples.""" return BlockIndex(self) def resize(self, new_shape: ChunkCoords) -> Array: + """ + Change the shape of the array by growing or shrinking one or more + dimensions. + + This method does not modify the original Array object. 
Instead, it returns a new Array
+        with the specified shape.
+
+        Examples
+        --------
+        >>> import zarr
+        >>> z = zarr.zeros(shape=(10000, 10000),
+        >>>        chunk_shape=(1000, 1000),
+        >>>        store=StorePath(MemoryStore(mode="w")),
+        >>>        dtype="i4",)
+        >>> z.shape
+        (10000, 10000)
+        >>> z = z.resize((20000, 1000))
+        >>> z.shape
+        (20000, 1000)
+        >>> z2 = z.resize((50, 50))
+        >>> z.shape
+        (20000, 1000)
+        >>> z2.shape
+        (50, 50)
+
+        Notes
+        -----
+        When resizing an array, the data are not rearranged in any way.
+
+        If one or more dimensions are shrunk, any chunks falling outside the
+        new array shape will be deleted from the underlying store.
+        However, it is noteworthy that the chunks partially falling inside the new array
+        (i.e. boundary chunks) will remain intact, and therefore,
+        the data falling outside the new array but inside the boundary chunks
+        would be restored by a subsequent resize operation that grows the array size.
+        """
         return type(self)(
             sync(
                 self._async_array.resize(new_shape),
diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py
index 1a34d9f290..44691ea352 100644
--- a/src/zarr/buffer.py
+++ b/src/zarr/buffer.py
@@ -64,6 +64,8 @@ def __getitem__(self, key: slice) -> Self: ...
 
     def __setitem__(self, key: slice, value: Any) -> None: ...
 
+    def __array__(self) -> npt.NDArray[Any]: ...
+
     def reshape(
         self, shape: ChunkCoords | Literal[-1], *, order: Literal["A", "C", "F"] = ...
     ) -> Self: ...
@@ -232,7 +234,7 @@ def __add__(self, other: Buffer) -> Self:
 
 
 class NDBuffer:
-    """A n-dimensional memory block
+    """An n-dimensional memory block
 
     We use NDBuffer throughout Zarr to represent a n-dimensional memory block
diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index e577d18fb2..df1976d4c1 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -125,17 +125,14 @@ def to_dict(self) -> dict[str, JSON]: } def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: + dtype = array_spec.dtype new_codec = self if new_codec.typesize is None: - new_codec = replace(new_codec, typesize=array_spec.dtype.itemsize) + new_codec = replace(new_codec, typesize=dtype.itemsize) if new_codec.shuffle is None: new_codec = replace( new_codec, - shuffle=( - BloscShuffle.bitshuffle - if array_spec.dtype.itemsize == 1 - else BloscShuffle.shuffle - ), + shuffle=(BloscShuffle.bitshuffle if dtype.itemsize == 1 else BloscShuffle.shuffle), ) return new_codec diff --git a/src/zarr/codecs/pipeline.py b/src/zarr/codecs/pipeline.py index acef311a8c..a7f47661b8 100644 --- a/src/zarr/codecs/pipeline.py +++ b/src/zarr/codecs/pipeline.py @@ -2,10 +2,12 @@ from collections.abc import Iterable, Iterator from dataclasses import dataclass -from itertools import islice -from typing import TYPE_CHECKING, TypeVar +from itertools import islice, pairwise +from typing import TYPE_CHECKING, Any, TypeVar from warnings import warn +import numpy as np + from zarr.abc.codec import ( ArrayArrayCodec, ArrayBytesCodec, @@ -17,11 +19,11 @@ ) from zarr.abc.store import ByteGetter, ByteSetter from zarr.buffer import Buffer, BufferPrototype, NDBuffer +from zarr.chunk_grids import ChunkGrid from zarr.codecs.registry import get_codec_class -from zarr.common import JSON, concurrent_map, parse_named_configuration +from zarr.common import JSON, ChunkCoords, concurrent_map, parse_named_configuration from zarr.config import config from zarr.indexing import SelectorTuple, is_scalar, is_total_slice -from zarr.metadata import ArrayMetadata if TYPE_CHECKING: from typing_extensions import Self @@ -87,54 +89,11 @@ def to_dict(self) -> JSON: return [c.to_dict() for c in self] def evolve_from_array_spec(self, 
array_spec: ArraySpec) -> Self: - return type(self).from_list([c.evolve_from_array_spec(array_spec) for c in self]) - - @staticmethod - def codecs_from_list( - codecs: list[Codec], - ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: - from zarr.codecs.sharding import ShardingCodec - - if not any(isinstance(codec, ArrayBytesCodec) for codec in codecs): - raise ValueError("Exactly one array-to-bytes codec is required.") - - prev_codec: Codec | None = None - for codec in codecs: - if prev_codec is not None: - if isinstance(codec, ArrayBytesCodec) and isinstance(prev_codec, ArrayBytesCodec): - raise ValueError( - f"ArrayBytesCodec '{type(codec)}' cannot follow after ArrayBytesCodec '{type(prev_codec)}' because exactly 1 ArrayBytesCodec is allowed." - ) - if isinstance(codec, ArrayBytesCodec) and isinstance(prev_codec, BytesBytesCodec): - raise ValueError( - f"ArrayBytesCodec '{type(codec)}' cannot follow after BytesBytesCodec '{type(prev_codec)}'." - ) - if isinstance(codec, ArrayArrayCodec) and isinstance(prev_codec, ArrayBytesCodec): - raise ValueError( - f"ArrayArrayCodec '{type(codec)}' cannot follow after ArrayBytesCodec '{type(prev_codec)}'." - ) - if isinstance(codec, ArrayArrayCodec) and isinstance(prev_codec, BytesBytesCodec): - raise ValueError( - f"ArrayArrayCodec '{type(codec)}' cannot follow after BytesBytesCodec '{type(prev_codec)}'." 
- ) - prev_codec = codec - - if any(isinstance(codec, ShardingCodec) for codec in codecs) and len(codecs) > 1: - warn( - "Combining a `sharding_indexed` codec disables partial reads and " - "writes, which may lead to inefficient performance.", - stacklevel=3, - ) - - return ( - tuple(codec for codec in codecs if isinstance(codec, ArrayArrayCodec)), - next(codec for codec in codecs if isinstance(codec, ArrayBytesCodec)), - tuple(codec for codec in codecs if isinstance(codec, BytesBytesCodec)), - ) + return type(self).from_list([c.evolve_from_array_spec(array_spec=array_spec) for c in self]) @classmethod - def from_list(cls, codecs: list[Codec], *, batch_size: int | None = None) -> Self: - array_array_codecs, array_bytes_codec, bytes_bytes_codecs = cls.codecs_from_list(codecs) + def from_list(cls, codecs: Iterable[Codec], *, batch_size: int | None = None) -> Self: + array_array_codecs, array_bytes_codec, bytes_bytes_codecs = codecs_from_list(codecs) return cls( array_array_codecs=array_array_codecs, @@ -180,9 +139,9 @@ def __iter__(self) -> Iterator[Codec]: yield self.array_bytes_codec yield from self.bytes_bytes_codecs - def validate(self, array_metadata: ArrayMetadata) -> None: + def validate(self, *, shape: ChunkCoords, dtype: np.dtype[Any], chunk_grid: ChunkGrid) -> None: for codec in self: - codec.validate(array_metadata) + codec.validate(shape=shape, dtype=dtype, chunk_grid=chunk_grid) def compute_encoded_size(self, byte_length: int, array_spec: ArraySpec) -> int: for codec in self: @@ -509,3 +468,64 @@ async def write( self.write_batch, config.get("async.concurrency"), ) + + +def codecs_from_list( + codecs: Iterable[Codec], +) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: + from zarr.codecs.sharding import ShardingCodec + + array_array: tuple[ArrayArrayCodec, ...] = () + array_bytes_maybe: ArrayBytesCodec | None = None + bytes_bytes: tuple[BytesBytesCodec, ...] 
= () + + if any(isinstance(codec, ShardingCodec) for codec in codecs) and len(tuple(codecs)) > 1: + warn( + "Combining a `sharding_indexed` codec disables partial reads and " + "writes, which may lead to inefficient performance.", + stacklevel=3, + ) + + for prev_codec, cur_codec in pairwise((None, *codecs)): + if isinstance(cur_codec, ArrayArrayCodec): + if isinstance(prev_codec, ArrayBytesCodec | BytesBytesCodec): + msg = ( + f"Invalid codec order. ArrayArrayCodec {cur_codec}" + "must be preceded by another ArrayArrayCodec. " + f"Got {type(prev_codec)} instead." + ) + raise ValueError(msg) + array_array += (cur_codec,) + + elif isinstance(cur_codec, ArrayBytesCodec): + if isinstance(prev_codec, BytesBytesCodec): + msg = ( + f"Invalid codec order. ArrayBytes codec {cur_codec}" + f" must be preceded by an ArrayArrayCodec. Got {type(prev_codec)} instead." + ) + raise ValueError(msg) + + if array_bytes_maybe is not None: + msg = ( + f"Got two instances of ArrayBytesCodec: {array_bytes_maybe} and {cur_codec}. " + "Only one array-to-bytes codec is allowed." + ) + raise ValueError(msg) + + array_bytes_maybe = cur_codec + + elif isinstance(cur_codec, BytesBytesCodec): + if isinstance(prev_codec, ArrayArrayCodec): + msg = ( + f"Invalid codec order. BytesBytesCodec {cur_codec}" + "must be preceded by either another BytesBytesCodec, or an ArrayBytesCodec. " + f"Got {type(prev_codec)} instead." 
+ ) + bytes_bytes += (cur_codec,) + else: + raise AssertionError + + if array_bytes_maybe is None: + raise ValueError("Required ArrayBytesCodec was not found.") + else: + return array_array, array_bytes_maybe, bytes_bytes diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 74ad5ac44f..def95b206d 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -5,7 +5,7 @@ from enum import Enum from functools import lru_cache from operator import itemgetter -from typing import TYPE_CHECKING, NamedTuple +from typing import TYPE_CHECKING, Any, NamedTuple import numpy as np import numpy.typing as npt @@ -15,12 +15,11 @@ ArrayBytesCodecPartialDecodeMixin, ArrayBytesCodecPartialEncodeMixin, Codec, - CodecPipeline, ) from zarr.abc.store import ByteGetter, ByteSetter from zarr.array_spec import ArraySpec from zarr.buffer import Buffer, BufferPrototype, NDBuffer, default_buffer_prototype -from zarr.chunk_grids import RegularChunkGrid +from zarr.chunk_grids import ChunkGrid, RegularChunkGrid from zarr.codecs.bytes import BytesCodec from zarr.codecs.crc32c_ import Crc32cCodec from zarr.codecs.pipeline import BatchedCodecPipeline @@ -34,7 +33,7 @@ product, ) from zarr.indexing import BasicIndexer, SelectorTuple, c_order_iter, get_indexer, morton_order_iter -from zarr.metadata import ArrayMetadata, parse_codecs +from zarr.metadata import parse_codecs if TYPE_CHECKING: from collections.abc import Awaitable, Callable, Iterator @@ -298,34 +297,22 @@ class ShardingCodec( ArrayBytesCodec, ArrayBytesCodecPartialDecodeMixin, ArrayBytesCodecPartialEncodeMixin ): chunk_shape: ChunkCoords - codecs: CodecPipeline - index_codecs: CodecPipeline + codecs: tuple[Codec, ...] + index_codecs: tuple[Codec, ...] 
index_location: ShardingCodecIndexLocation = ShardingCodecIndexLocation.end def __init__( self, *, chunk_shape: ChunkCoordsLike, - codecs: Iterable[Codec | JSON] | None = None, - index_codecs: Iterable[Codec | JSON] | None = None, - index_location: ShardingCodecIndexLocation | None = ShardingCodecIndexLocation.end, + codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(),), + index_codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(), Crc32cCodec()), + index_location: ShardingCodecIndexLocation = ShardingCodecIndexLocation.end, ) -> None: chunk_shape_parsed = parse_shapelike(chunk_shape) - codecs_parsed = ( - parse_codecs(codecs) - if codecs is not None - else BatchedCodecPipeline.from_list([BytesCodec()]) - ) - index_codecs_parsed = ( - parse_codecs(index_codecs) - if index_codecs is not None - else BatchedCodecPipeline.from_list([BytesCodec(), Crc32cCodec()]) - ) - index_location_parsed = ( - parse_index_location(index_location) - if index_location is not None - else ShardingCodecIndexLocation.end - ) + codecs_parsed = parse_codecs(codecs) + index_codecs_parsed = parse_codecs(index_codecs) + index_location_parsed = parse_index_location(index_location) object.__setattr__(self, "chunk_shape", chunk_shape_parsed) object.__setattr__(self, "codecs", codecs_parsed) @@ -342,35 +329,39 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: _, configuration_parsed = parse_named_configuration(data, "sharding_indexed") return cls(**configuration_parsed) # type: ignore[arg-type] + @property + def codec_pipeline(self) -> BatchedCodecPipeline: + return BatchedCodecPipeline.from_list(self.codecs) + def to_dict(self) -> dict[str, JSON]: return { "name": "sharding_indexed", "configuration": { "chunk_shape": list(self.chunk_shape), - "codecs": self.codecs.to_dict(), - "index_codecs": self.index_codecs.to_dict(), + "codecs": [s.to_dict() for s in self.codecs], + "index_codecs": [s.to_dict() for s in self.index_codecs], "index_location": self.index_location, }, } def 
evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: shard_spec = self._get_chunk_spec(array_spec) - evolved_codecs = self.codecs.evolve_from_array_spec(shard_spec) + evolved_codecs = tuple(c.evolve_from_array_spec(array_spec=shard_spec) for c in self.codecs) if evolved_codecs != self.codecs: return replace(self, codecs=evolved_codecs) return self - def validate(self, array_metadata: ArrayMetadata) -> None: - if len(self.chunk_shape) != array_metadata.ndim: + def validate(self, *, shape: ChunkCoords, dtype: np.dtype[Any], chunk_grid: ChunkGrid) -> None: + if len(self.chunk_shape) != len(shape): raise ValueError( "The shard's `chunk_shape` and array's `shape` need to have the same number of dimensions." ) - if not isinstance(array_metadata.chunk_grid, RegularChunkGrid): + if not isinstance(chunk_grid, RegularChunkGrid): raise ValueError("Sharding is only compatible with regular chunk grids.") if not all( s % c == 0 for s, c in zip( - array_metadata.chunk_grid.chunk_shape, + chunk_grid.chunk_shape, self.chunk_shape, strict=False, ) @@ -406,7 +397,7 @@ async def _decode_single( return out # decoding chunks and writing them into the output buffer - await self.codecs.read( + await self.codec_pipeline.read( [ ( _ShardingByteGetter(shard_dict, chunk_coords), @@ -474,7 +465,7 @@ async def _decode_partial_single( shard_dict[chunk_coords] = chunk_bytes # decoding chunks and writing them into the output buffer - await self.codecs.read( + await self.codec_pipeline.read( [ ( _ShardingByteGetter(shard_dict, chunk_coords), @@ -508,7 +499,7 @@ async def _encode_single( shard_builder = _ShardBuilder.create_empty(chunks_per_shard) - await self.codecs.write( + await self.codec_pipeline.write( [ ( _ShardingByteSetter(shard_builder, chunk_coords), @@ -551,7 +542,7 @@ async def _encode_partial_single( ) ) - await self.codecs.write( + await self.codec_pipeline.write( [ ( _ShardingByteSetter(shard_dict, chunk_coords), @@ -586,7 +577,7 @@ async def _decode_shard_index( ) -> 
_ShardIndex: index_array = next( iter( - await self.index_codecs.decode( + await BatchedCodecPipeline.from_list(self.index_codecs).decode( [(index_bytes, self._get_index_chunk_spec(chunks_per_shard))], ) ) @@ -597,7 +588,7 @@ async def _decode_shard_index( async def _encode_shard_index(self, index: _ShardIndex) -> Buffer: index_bytes = next( iter( - await self.index_codecs.encode( + await BatchedCodecPipeline.from_list(self.index_codecs).encode( [ ( NDBuffer.from_numpy_array(index.offsets_and_lengths), @@ -612,7 +603,7 @@ async def _encode_shard_index(self, index: _ShardIndex) -> Buffer: return index_bytes def _shard_index_size(self, chunks_per_shard: ChunkCoords) -> int: - return self.index_codecs.compute_encoded_size( + return BatchedCodecPipeline.from_list(self.index_codecs).compute_encoded_size( 16 * product(chunks_per_shard), self._get_index_chunk_spec(chunks_per_shard) ) diff --git a/src/zarr/codecs/transpose.py b/src/zarr/codecs/transpose.py index 33dab21fb6..0c55a6ec4a 100644 --- a/src/zarr/codecs/transpose.py +++ b/src/zarr/codecs/transpose.py @@ -2,13 +2,14 @@ from collections.abc import Iterable from dataclasses import dataclass, replace -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, Any, cast import numpy as np from zarr.abc.codec import ArrayArrayCodec from zarr.array_spec import ArraySpec from zarr.buffer import NDBuffer +from zarr.chunk_grids import ChunkGrid from zarr.codecs.registry import register_codec from zarr.common import JSON, ChunkCoordsLike, parse_named_configuration @@ -45,8 +46,23 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: def to_dict(self) -> dict[str, JSON]: return {"name": "transpose", "configuration": {"order": list(self.order)}} + def validate(self, shape: tuple[int, ...], dtype: np.dtype[Any], chunk_grid: ChunkGrid) -> None: + if len(self.order) != len(shape): + raise ValueError( + f"The `order` tuple needs have as many entries as there are dimensions in the array. Got {self.order}." 
+ ) + if len(self.order) != len(set(self.order)): + raise ValueError( + f"There must not be duplicates in the `order` tuple. Got {self.order}." + ) + if not all(0 <= x < len(shape) for x in self.order): + raise ValueError( + f"All entries in the `order` tuple must be between 0 and the number of dimensions in the array. Got {self.order}." + ) + def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: - if len(self.order) != array_spec.ndim: + ndim = array_spec.ndim + if len(self.order) != ndim: raise ValueError( f"The `order` tuple needs have as many entries as there are dimensions in the array. Got {self.order}." ) @@ -54,7 +70,7 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: raise ValueError( f"There must not be duplicates in the `order` tuple. Got {self.order}." ) - if not all(0 <= x < array_spec.ndim for x in self.order): + if not all(0 <= x < ndim for x in self.order): raise ValueError( f"All entries in the `order` tuple must be between 0 and the number of dimensions in the array. Got {self.order}." ) diff --git a/src/zarr/common.py b/src/zarr/common.py index 9349f9f018..342db1412d 100644 --- a/src/zarr/common.py +++ b/src/zarr/common.py @@ -30,8 +30,6 @@ BytesLike = bytes | bytearray | memoryview ChunkCoords = tuple[int, ...] ChunkCoordsLike = Iterable[int] -SliceSelection = tuple[slice, ...] -Selection = slice | SliceSelection ZarrFormat = Literal[2, 3] JSON = None | str | int | float | Enum | dict[str, "JSON"] | list["JSON"] | tuple["JSON", ...] MemoryOrder = Literal["C", "F"] @@ -137,17 +135,21 @@ def parse_named_configuration( def parse_shapelike(data: int | Iterable[int]) -> tuple[int, ...]: if isinstance(data, int): + if data < 0: + raise ValueError(f"Expected a non-negative integer. Got {data} instead") return (data,) - if not isinstance(data, Iterable): - raise TypeError(f"Expected an iterable. Got {data} instead.") - data_tuple = tuple(data) - if len(data_tuple) == 0: - raise ValueError("Expected at least one element. 
Got 0.") + try: + data_tuple = tuple(data) + except TypeError as e: + msg = f"Expected an integer or an iterable of integers. Got {data} instead." + raise TypeError(msg) from e + if not all(isinstance(v, int) for v in data_tuple): - msg = f"Expected an iterable of integers. Got {type(data)} instead." + msg = f"Expected an iterable of integers. Got {data} instead." raise TypeError(msg) - if not all(lambda v: v > 0 for v in data_tuple): - raise ValueError(f"All values must be greater than 0. Got {data}.") + if not all(v > -1 for v in data_tuple): + msg = f"Expected all values to be non-negative. Got {data} instead." + raise ValueError(msg) return data_tuple diff --git a/src/zarr/config.py b/src/zarr/config.py index 7c5b48a16c..e711a98cb5 100644 --- a/src/zarr/config.py +++ b/src/zarr/config.py @@ -11,6 +11,7 @@ "array": {"order": "C"}, "async": {"concurrency": None, "timeout": None}, "codec_pipeline": {"batch_size": 1}, + "json_indent": 2, } ], ) diff --git a/src/zarr/group.py b/src/zarr/group.py index 4bb4b6b4dd..e6e2ac183f 100644 --- a/src/zarr/group.py +++ b/src/zarr/group.py @@ -25,6 +25,7 @@ ChunkCoords, ZarrFormat, ) +from zarr.config import config from zarr.store import StoreLike, StorePath, make_store_path from zarr.sync import SyncMixin, sync @@ -79,14 +80,21 @@ class GroupMetadata(Metadata): node_type: Literal["group"] = field(default="group", init=False) def to_buffer_dict(self) -> dict[str, Buffer]: + json_indent = config.get("json_indent") if self.zarr_format == 3: - return {ZARR_JSON: Buffer.from_bytes(json.dumps(self.to_dict()).encode())} + return { + ZARR_JSON: Buffer.from_bytes( + json.dumps(self.to_dict(), indent=json_indent).encode() + ) + } else: return { ZGROUP_JSON: Buffer.from_bytes( - json.dumps({"zarr_format": self.zarr_format}).encode() + json.dumps({"zarr_format": self.zarr_format}, indent=json_indent).encode() + ), + ZATTRS_JSON: Buffer.from_bytes( + json.dumps(self.attributes, indent=json_indent).encode() ), - ZATTRS_JSON: 
Buffer.from_bytes(json.dumps(self.attributes).encode()), } def __init__(self, attributes: dict[str, Any] | None = None, zarr_format: ZarrFormat = 3): diff --git a/src/zarr/indexing.py b/src/zarr/indexing.py index 98130fe0cd..74cbbe8c6b 100644 --- a/src/zarr/indexing.py +++ b/src/zarr/indexing.py @@ -23,39 +23,25 @@ import numpy as np import numpy.typing as npt +from zarr.buffer import NDArrayLike from zarr.common import ChunkCoords, product if TYPE_CHECKING: from zarr.array import Array - from zarr.buffer import NDArrayLike from zarr.chunk_grids import ChunkGrid +IntSequence = list[int] | npt.NDArray[np.intp] +ArrayOfIntOrBool = npt.NDArray[np.intp] | npt.NDArray[np.bool_] BasicSelector = int | slice | EllipsisType -BasicSelectorTuple = tuple[BasicSelector, ...] -BasicSelection = BasicSelector | BasicSelectorTuple -BasicSelectionNormalized = tuple[int | slice, ...] -CoordinateSelector = list[int] | npt.NDArray[np.intp] -CoordinateSelection = CoordinateSelector | tuple[CoordinateSelector, ...] -CoordinateSelectionNormalized = tuple[npt.NDArray[np.intp], ...] -BlockSelector = int | slice -BlockSelection = BlockSelector | tuple[BlockSelector, ...] -BlockSelectionNormalized = tuple[BlockSelector, ...] -MaskSelection = npt.NDArray[np.bool_] -OrthogonalSelector = int | slice | npt.NDArray[np.intp] | npt.NDArray[np.bool_] -OrthogonalSelection = OrthogonalSelector | tuple[OrthogonalSelector, ...] -OrthogonalSelectionNormalized = tuple[OrthogonalSelector, ...] +Selector = BasicSelector | ArrayOfIntOrBool -Selection = ( - BasicSelection | CoordinateSelection | BlockSelection | MaskSelection | OrthogonalSelection -) -SelectionNormalized = ( - BasicSelectionNormalized - | CoordinateSelectionNormalized - | BlockSelectionNormalized - | MaskSelection - | OrthogonalSelectionNormalized -) -Selector = int | slice | npt.NDArray[np.intp] | npt.NDArray[np.bool_] +BasicSelection = BasicSelector | tuple[BasicSelector, ...] 
# also used for BlockIndex +CoordinateSelection = IntSequence | tuple[IntSequence, ...] +MaskSelection = npt.NDArray[np.bool_] +OrthogonalSelection = Selector | tuple[Selector, ...] +Selection = BasicSelection | CoordinateSelection | MaskSelection | OrthogonalSelection +CoordinateSelectionNormalized = tuple[npt.NDArray[np.intp], ...] +SelectionNormalized = tuple[Selector, ...] | ArrayOfIntOrBool SelectionWithFields = Selection | str | Sequence[str] SelectorTuple = tuple[Selector, ...] | npt.NDArray[np.intp] | slice Fields = str | list[str] | tuple[str, ...] @@ -846,7 +832,7 @@ def __getitem__(self, selection: OrthogonalSelection) -> NDArrayLike: cast(OrthogonalSelection, new_selection), fields=fields ) - def __setitem__(self, selection: OrthogonalSelection, value: NDArrayLike) -> None: + def __setitem__(self, selection: OrthogonalSelection, value: npt.ArrayLike) -> None: fields, new_selection = pop_fields(selection) new_selection = ensure_tuple(new_selection) new_selection = replace_lists(new_selection) @@ -861,7 +847,7 @@ class BlockIndexer(Indexer): shape: ChunkCoords drop_axes: ChunkCoords - def __init__(self, selection: BlockSelection, shape: ChunkCoords, chunk_grid: ChunkGrid): + def __init__(self, selection: BasicSelection, shape: ChunkCoords, chunk_grid: ChunkGrid): chunk_shape = get_chunk_shape(chunk_grid) # handle ellipsis @@ -940,18 +926,18 @@ def __iter__(self) -> Iterator[ChunkProjection]: class BlockIndex: array: Array - def __getitem__(self, selection: BlockSelection) -> NDArrayLike: + def __getitem__(self, selection: BasicSelection) -> NDArrayLike: fields, new_selection = pop_fields(selection) new_selection = ensure_tuple(new_selection) new_selection = replace_lists(new_selection) - return self.array.get_block_selection(cast(BlockSelection, new_selection), fields=fields) + return self.array.get_block_selection(cast(BasicSelection, new_selection), fields=fields) - def __setitem__(self, selection: BlockSelection, value: NDArrayLike) -> None: + def 
__setitem__(self, selection: BasicSelection, value: npt.ArrayLike) -> None: fields, new_selection = pop_fields(selection) new_selection = ensure_tuple(new_selection) new_selection = replace_lists(new_selection) return self.array.set_block_selection( - cast(BlockSelection, new_selection), value, fields=fields + cast(BasicSelection, new_selection), value, fields=fields ) @@ -1138,7 +1124,7 @@ def __getitem__(self, selection: CoordinateSelection | MaskSelection) -> NDArray raise VindexInvalidSelectionError(new_selection) def __setitem__( - self, selection: CoordinateSelection | MaskSelection, value: NDArrayLike + self, selection: CoordinateSelection | MaskSelection, value: npt.ArrayLike ) -> None: fields, new_selection = pop_fields(selection) new_selection = ensure_tuple(new_selection) @@ -1206,8 +1192,8 @@ def pop_fields(selection: SelectionWithFields) -> tuple[Fields | None, Selection return fields, selection -def make_slice_selection(selection: Any) -> list[int | slice]: - ls: list[int | slice] = [] +def make_slice_selection(selection: Any) -> list[slice]: + ls: list[slice] = [] for dim_selection in selection: if is_integer(dim_selection): ls.append(slice(int(dim_selection), int(dim_selection) + 1, 1)) diff --git a/src/zarr/metadata.py b/src/zarr/metadata.py index 8329bd9200..729c7ba13c 100644 --- a/src/zarr/metadata.py +++ b/src/zarr/metadata.py @@ -10,12 +10,13 @@ import numpy as np import numpy.typing as npt -from zarr.abc.codec import Codec, CodecPipeline +from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec, CodecPipeline from zarr.abc.metadata import Metadata from zarr.buffer import Buffer, BufferPrototype, default_buffer_prototype from zarr.chunk_grids import ChunkGrid, RegularChunkGrid from zarr.chunk_key_encodings import ChunkKeyEncoding, parse_separator -from zarr.codecs._v2 import V2Compressor, V2Filters +from zarr.codecs.registry import get_codec_class +from zarr.config import config if TYPE_CHECKING: from 
typing_extensions import Self @@ -32,6 +33,7 @@ ZarrFormat, parse_dtype, parse_fill_value, + parse_named_configuration, parse_shapelike, ) from zarr.config import parse_indexing_order @@ -131,11 +133,6 @@ def dtype(self) -> np.dtype[Any]: def ndim(self) -> int: pass - @property - @abstractmethod - def codec_pipeline(self) -> CodecPipeline: - pass - @abstractmethod def get_chunk_spec( self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype @@ -166,7 +163,7 @@ class ArrayV3Metadata(ArrayMetadata): chunk_grid: ChunkGrid chunk_key_encoding: ChunkKeyEncoding fill_value: Any - codecs: CodecPipeline + codecs: tuple[Codec, ...] attributes: dict[str, Any] = field(default_factory=dict) dimension_names: tuple[str, ...] | None = None zarr_format: Literal[3] = field(default=3, init=False) @@ -180,7 +177,7 @@ def __init__( chunk_grid: dict[str, JSON] | ChunkGrid, chunk_key_encoding: dict[str, JSON] | ChunkKeyEncoding, fill_value: Any, - codecs: Iterable[Codec | JSON], + codecs: Iterable[Codec | dict[str, JSON]], attributes: None | dict[str, JSON], dimension_names: None | Iterable[str], ) -> None: @@ -194,6 +191,7 @@ def __init__( dimension_names_parsed = parse_dimension_names(dimension_names) fill_value_parsed = parse_fill_value(fill_value) attributes_parsed = parse_attributes(attributes) + codecs_parsed_partial = parse_codecs(codecs) array_spec = ArraySpec( shape=shape_parsed, @@ -202,7 +200,7 @@ def __init__( order="C", # TODO: order is not needed here. prototype=default_buffer_prototype, # TODO: prototype is not needed here. 
) - codecs_parsed = parse_codecs(codecs).evolve_from_array_spec(array_spec) + codecs_parsed = [c.evolve_from_array_spec(array_spec) for c in codecs_parsed_partial] object.__setattr__(self, "shape", shape_parsed) object.__setattr__(self, "data_type", data_type_parsed) @@ -228,7 +226,8 @@ def _validate_metadata(self) -> None: ) if self.fill_value is None: raise ValueError("`fill_value` is required.") - self.codecs.validate(self) + for codec in self.codecs: + codec.validate(shape=self.shape, dtype=self.data_type, chunk_grid=self.chunk_grid) @property def dtype(self) -> np.dtype[Any]: @@ -238,10 +237,6 @@ def dtype(self) -> np.dtype[Any]: def ndim(self) -> int: return len(self.shape) - @property - def codec_pipeline(self) -> CodecPipeline: - return self.codecs - def get_chunk_spec( self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype ) -> ArraySpec: @@ -272,8 +267,11 @@ def _json_convert(o: np.dtype[Any] | Enum | Codec) -> str | dict[str, Any]: return config raise TypeError + json_indent = config.get("json_indent") return { - ZARR_JSON: Buffer.from_bytes(json.dumps(self.to_dict(), default=_json_convert).encode()) + ZARR_JSON: Buffer.from_bytes( + json.dumps(self.to_dict(), default=_json_convert, indent=json_indent).encode() + ) } @classmethod @@ -371,14 +369,6 @@ def dtype(self) -> np.dtype[Any]: def chunks(self) -> ChunkCoords: return self.chunk_grid.chunk_shape - @property - def codec_pipeline(self) -> CodecPipeline: - from zarr.codecs import BatchedCodecPipeline - - return BatchedCodecPipeline.from_list( - [V2Filters(self.filters or []), V2Compressor(self.compressor)] - ) - def to_buffer_dict(self) -> dict[str, Buffer]: def _json_convert( o: np.dtype[Any], @@ -394,9 +384,12 @@ def _json_convert( assert isinstance(zarray_dict, dict) zattrs_dict = zarray_dict.pop("attributes", {}) assert isinstance(zattrs_dict, dict) + json_indent = config.get("json_indent") return { - ZARRAY_JSON: Buffer.from_bytes(json.dumps(zarray_dict, 
default=_json_convert).encode()), - ZATTRS_JSON: Buffer.from_bytes(json.dumps(zattrs_dict).encode()), + ZARRAY_JSON: Buffer.from_bytes( + json.dumps(zarray_dict, default=_json_convert, indent=json_indent).encode() + ), + ZATTRS_JSON: Buffer.from_bytes(json.dumps(zattrs_dict, indent=json_indent).encode()), } @classmethod @@ -500,9 +493,27 @@ def parse_v2_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata: return data -def parse_codecs(data: Iterable[Codec | JSON]) -> CodecPipeline: +def create_pipeline(data: Iterable[Codec | JSON]) -> CodecPipeline: from zarr.codecs import BatchedCodecPipeline if not isinstance(data, Iterable): raise TypeError(f"Expected iterable, got {type(data)}") return BatchedCodecPipeline.from_dict(data) + + +def parse_codecs(data: Iterable[Codec | dict[str, JSON]]) -> tuple[Codec, ...]: + out: tuple[Codec, ...] = () + + if not isinstance(data, Iterable): + raise TypeError(f"Expected iterable, got {type(data)}") + + for c in data: + if isinstance( + c, ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec + ): # Can't use Codec here because of mypy limitation + out += (c,) + else: + name_parsed, _ = parse_named_configuration(c, require_configuration=False) + out += (get_codec_class(name_parsed).from_dict(c),) + + return out diff --git a/tests/v3/test_codecs.py b/tests/v3/test_codecs.py index 514294c4b0..7cb0d0f804 100644 --- a/tests/v3/test_codecs.py +++ b/tests/v3/test_codecs.py @@ -21,9 +21,8 @@ TransposeCodec, ZstdCodec, ) -from zarr.common import Selection from zarr.config import config -from zarr.indexing import morton_order_iter +from zarr.indexing import Selection, morton_order_iter from zarr.store import MemoryStore, StorePath from zarr.testing.utils import assert_bytes_equal diff --git a/tests/v3/test_common.py b/tests/v3/test_common.py index cc33aa75cf..bb59789843 100644 --- a/tests/v3/test_common.py +++ b/tests/v3/test_common.py @@ -14,28 +14,28 @@ @pytest.mark.parametrize("data", [(0, 0, 0, 0), (1, 3, 4, 5, 6), (2, 4)]) -def 
test_product(data: tuple[int, ...]): +def test_product(data: tuple[int, ...]) -> None: assert product(data) == np.prod(data) # todo: test -def test_concurrent_map(): ... +def test_concurrent_map() -> None: ... # todo: test -def test_to_thread(): ... +def test_to_thread() -> None: ... # todo: test -def test_enum_names(): ... +def test_enum_names() -> None: ... # todo: test -def test_parse_enum(): ... +def test_parse_enum() -> None: ... @pytest.mark.parametrize("data", [("foo", "bar"), (10, 11)]) -def test_parse_name_invalid(data: tuple[Any, Any]): +def test_parse_name_invalid(data: tuple[Any, Any]) -> None: observed, expected = data if isinstance(observed, str): with pytest.raises(ValueError, match=f"Expected '{expected}'. Got {observed} instead."): @@ -48,47 +48,71 @@ def test_parse_name_invalid(data: tuple[Any, Any]): @pytest.mark.parametrize("data", [("foo", "foo"), ("10", "10")]) -def test_parse_name_valid(data: tuple[Any, Any]): +def test_parse_name_valid(data: tuple[Any, Any]) -> None: observed, expected = data assert parse_name(observed, expected) == observed @pytest.mark.parametrize("data", [0, 1, "hello", "f"]) -def test_parse_indexing_order_invalid(data): +def test_parse_indexing_order_invalid(data: Any) -> None: with pytest.raises(ValueError, match="Expected one of"): parse_indexing_order(data) @pytest.mark.parametrize("data", ["C", "F"]) -def parse_indexing_order_valid(data: Literal["C", "F"]): +def parse_indexing_order_valid(data: Literal["C", "F"]) -> None: assert parse_indexing_order(data) == data -@pytest.mark.parametrize("data", [("0", 1, 2, 3), {"0": "0"}, []]) -def test_parse_shapelike_invalid(data: Any): - if isinstance(data, Iterable): - if len(data) == 0: - with pytest.raises(ValueError, match="Expected at least one element."): - parse_shapelike(data) - else: - with pytest.raises(TypeError, match="Expected an iterable of integers"): - parse_shapelike(data) - else: - with pytest.raises(TypeError, match="Expected an iterable."): - 
parse_shapelike(data) +@pytest.mark.parametrize("data", [lambda v: v, slice(None)]) +def test_parse_shapelike_invalid_single_type(data: Any) -> None: + """ + Test that we get the expected error message when passing in a value that is not an integer + or an iterable of integers. + """ + with pytest.raises(TypeError, match="Expected an integer or an iterable of integers."): + parse_shapelike(data) + + +def test_parse_shapelike_invalid_single_value() -> None: + """ + Test that we get the expected error message when passing in a negative integer. + """ + with pytest.raises(ValueError, match="Expected a non-negative integer."): + parse_shapelike(-1) + + +@pytest.mark.parametrize("data", ["shape", ("0", 1, 2, 3), {"0": "0"}, ((1, 2), (2, 2)), (4.0, 2)]) +def test_parse_shapelike_invalid_iterable_types(data: Any) -> None: + """ + Test that we get the expected error message when passing in an iterable containing + non-integer elements + """ + with pytest.raises(TypeError, match="Expected an iterable of integers"): + parse_shapelike(data) + + +@pytest.mark.parametrize("data", [(1, 2, 3, -1), (-10,)]) +def test_parse_shapelike_invalid_iterable_values(data: Any) -> None: + """ + Test that we get the expected error message when passing in an iterable containing negative + integers + """ + with pytest.raises(ValueError, match="Expected all values to be non-negative."): + parse_shapelike(data) -@pytest.mark.parametrize("data", [range(10), [0, 1, 2, 3], (3, 4, 5)]) -def test_parse_shapelike_valid(data: Iterable[Any]): +@pytest.mark.parametrize("data", [range(10), [0, 1, 2, 3], (3, 4, 5), ()]) +def test_parse_shapelike_valid(data: Iterable[int]) -> None: assert parse_shapelike(data) == tuple(data) # todo: more dtypes @pytest.mark.parametrize("data", [("uint8", np.uint8), ("float64", np.float64)]) -def parse_dtype(data: tuple[str, np.dtype]): +def parse_dtype(data: tuple[str, np.dtype]) -> None: unparsed, parsed = data assert parse_dtype(unparsed) == parsed # todo: figure out what 
it means to test this -def test_parse_fill_value(): ... +def test_parse_fill_value() -> None: ... diff --git a/tests/v3/test_config.py b/tests/v3/test_config.py index aed9775d17..684ab0dfce 100644 --- a/tests/v3/test_config.py +++ b/tests/v3/test_config.py @@ -1,19 +1,32 @@ +from typing import Any + +import pytest + from zarr.config import config -def test_config_defaults_set(): +def test_config_defaults_set() -> None: # regression test for available defaults assert config.defaults == [ { "array": {"order": "C"}, "async": {"concurrency": None, "timeout": None}, "codec_pipeline": {"batch_size": 1}, + "json_indent": 2, } ] assert config.get("array.order") == "C" + assert config.get("async.concurrency") is None + assert config.get("async.timeout") is None + assert config.get("codec_pipeline.batch_size") == 1 + assert config.get("json_indent") == 2 -def test_config_defaults_can_be_overridden(): - assert config.get("array.order") == "C" - with config.set({"array.order": "F"}): - assert config.get("array.order") == "F" +@pytest.mark.parametrize( + "key, old_val, new_val", + [("array.order", "C", "F"), ("async.concurrency", None, 10), ("json_indent", 2, 0)], +) +def test_config_defaults_can_be_overridden(key: str, old_val: Any, new_val: Any) -> None: + assert config.get(key) == old_val + with config.set({key: new_val}): + assert config.get(key) == new_val diff --git a/tests/v3/test_indexing.py b/tests/v3/test_indexing.py index 00ea947b49..13a7d953e1 100644 --- a/tests/v3/test_indexing.py +++ b/tests/v3/test_indexing.py @@ -42,7 +42,7 @@ def zarr_array_from_numpy_array( chunk_shape=chunk_shape or a.shape, chunk_key_encoding=("v2", "."), ) - z[:] = a + z[()] = a return z @@ -111,42 +111,55 @@ def test_replace_ellipsis(): ) -@pytest.mark.xfail(reason="zero-dimension arrays are not supported in v3") -def test_get_basic_selection_0d(store: StorePath): +@pytest.mark.parametrize( + "value, dtype", + [ + (42, "uint8"), + pytest.param( + (b"aaa", 1, 4.2), [("foo", "S3"), ("bar", 
"i4"), ("baz", "f8")], marks=pytest.mark.xfail + ), + ], +) +@pytest.mark.parametrize("use_out", (True, False)) +def test_get_basic_selection_0d(store: StorePath, use_out: bool, value: Any, dtype: Any) -> None: # setup - a = np.array(42) - z = zarr_array_from_numpy_array(store, a) + arr_np = np.array(value, dtype=dtype) + arr_z = zarr_array_from_numpy_array(store, arr_np) - assert_array_equal(a, z.get_basic_selection(Ellipsis)) - assert_array_equal(a, z[...]) - assert 42 == z.get_basic_selection(()) - assert 42 == z[()] + assert_array_equal(arr_np, arr_z.get_basic_selection(Ellipsis)) + assert_array_equal(arr_np, arr_z[...]) + assert value == arr_z.get_basic_selection(()) + assert value == arr_z[()] - # test out param - b = NDBuffer.from_numpy_array(np.zeros_like(a)) - z.get_basic_selection(Ellipsis, out=b) - assert_array_equal(a, b) + if use_out: + # test out param + b = NDBuffer.from_numpy_array(np.zeros_like(arr_np)) + arr_z.get_basic_selection(Ellipsis, out=b) + assert_array_equal(arr_np, b.as_ndarray_like()) + + # todo: uncomment the structured array tests when we can make them pass, + # or delete them if we formally decide not to support structured dtypes. 
# test structured array - value = (b"aaa", 1, 4.2) - a = np.array(value, dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")]) - z = zarr_array_from_numpy_array(store, a) - z[()] = value - assert_array_equal(a, z.get_basic_selection(Ellipsis)) - assert_array_equal(a, z[...]) - assert a[()] == z.get_basic_selection(()) - assert a[()] == z[()] - assert b"aaa" == z.get_basic_selection((), fields="foo") - assert b"aaa" == z["foo"] - assert a[["foo", "bar"]] == z.get_basic_selection((), fields=["foo", "bar"]) - assert a[["foo", "bar"]] == z["foo", "bar"] - # test out param - b = NDBuffer.from_numpy_array(np.zeros_like(a)) - z.get_basic_selection(Ellipsis, out=b) - assert_array_equal(a, b) - c = NDBuffer.from_numpy_array(np.zeros_like(a[["foo", "bar"]])) - z.get_basic_selection(Ellipsis, out=c, fields=["foo", "bar"]) - assert_array_equal(a[["foo", "bar"]], c) + # value = (b"aaa", 1, 4.2) + # a = np.array(value, dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")]) + # z = zarr_array_from_numpy_array(store, a) + # z[()] = value + # assert_array_equal(a, z.get_basic_selection(Ellipsis)) + # assert_array_equal(a, z[...]) + # assert a[()] == z.get_basic_selection(()) + # assert a[()] == z[()] + # assert b"aaa" == z.get_basic_selection((), fields="foo") + # assert b"aaa" == z["foo"] + # assert a[["foo", "bar"]] == z.get_basic_selection((), fields=["foo", "bar"]) + # assert a[["foo", "bar"]] == z["foo", "bar"] + # # test out param + # b = NDBuffer.from_numpy_array(np.zeros_like(a)) + # z.get_basic_selection(Ellipsis, out=b) + # assert_array_equal(a, b) + # c = NDBuffer.from_numpy_array(np.zeros_like(a[["foo", "bar"]])) + # z.get_basic_selection(Ellipsis, out=c, fields=["foo", "bar"]) + # assert_array_equal(a[["foo", "bar"]], c) basic_selections_1d = [ @@ -466,51 +479,46 @@ def test_fancy_indexing_doesnt_mix_with_implicit_slicing(store: StorePath): np.testing.assert_array_equal(z2[..., [1, 2, 3]], 0) -@pytest.mark.xfail(reason="zero-dimension arrays are not supported in v3") -def 
test_set_basic_selection_0d(store: StorePath): - # setup - v = np.array(42) - a = np.zeros_like(v) - z = zarr_array_from_numpy_array(store, v) - assert_array_equal(a, z[:]) - - # tests - z.set_basic_selection(Ellipsis, v) - assert_array_equal(v, z[:]) - z[...] = 0 - assert_array_equal(a, z[:]) - z[...] = v - assert_array_equal(v, z[:]) - - # test structured array - value = (b"aaa", 1, 4.2) - v = np.array(value, dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")]) - a = np.zeros_like(v) - z = zarr_array_from_numpy_array(store, a) - - # tests - z.set_basic_selection(Ellipsis, v) - assert_array_equal(v, z[:]) - z.set_basic_selection(Ellipsis, a) - assert_array_equal(a, z[:]) - z[...] = v - assert_array_equal(v, z[:]) - z[...] = a - assert_array_equal(a, z[:]) - # with fields - z.set_basic_selection(Ellipsis, v["foo"], fields="foo") - assert v["foo"] == z["foo"] - assert a["bar"] == z["bar"] - assert a["baz"] == z["baz"] - z["bar"] = v["bar"] - assert v["foo"] == z["foo"] - assert v["bar"] == z["bar"] - assert a["baz"] == z["baz"] - # multiple field assignment not supported - with pytest.raises(IndexError): - z.set_basic_selection(Ellipsis, v[["foo", "bar"]], fields=["foo", "bar"]) - with pytest.raises(IndexError): - z[..., "foo", "bar"] = v[["foo", "bar"]] +@pytest.mark.parametrize( + "value, dtype", + [ + (42, "uint8"), + pytest.param( + (b"aaa", 1, 4.2), [("foo", "S3"), ("bar", "i4"), ("baz", "f8")], marks=pytest.mark.xfail + ), + ], +) +def test_set_basic_selection_0d( + store: StorePath, value: Any, dtype: str | list[tuple[str, str]] +) -> None: + arr_np = np.array(value, dtype=dtype) + arr_np_zeros = np.zeros_like(arr_np, dtype=dtype) + arr_z = zarr_array_from_numpy_array(store, arr_np_zeros) + assert_array_equal(arr_np_zeros, arr_z) + + arr_z.set_basic_selection(Ellipsis, value) + assert_array_equal(value, arr_z) + arr_z[...] = 0 + assert_array_equal(arr_np_zeros, arr_z) + arr_z[...] 
= value + assert_array_equal(value, arr_z) + + # todo: uncomment the structured array tests when we can make them pass, + # or delete them if we formally decide not to support structured dtypes. + + # arr_z.set_basic_selection(Ellipsis, v["foo"], fields="foo") + # assert v["foo"] == arr_z["foo"] + # assert arr_np_zeros["bar"] == arr_z["bar"] + # assert arr_np_zeros["baz"] == arr_z["baz"] + # arr_z["bar"] = v["bar"] + # assert v["foo"] == arr_z["foo"] + # assert v["bar"] == arr_z["bar"] + # assert arr_np_zeros["baz"] == arr_z["baz"] + # # multiple field assignment not supported + # with pytest.raises(IndexError): + # arr_z.set_basic_selection(Ellipsis, v[["foo", "bar"]], fields=["foo", "bar"]) + # with pytest.raises(IndexError): + # arr_z[..., "foo", "bar"] = v[["foo", "bar"]] def _test_get_orthogonal_selection(a, z, selection):