diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 802a1428..cee9dacb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,9 +2,9 @@ name: build on: push: - branches: ["main", "v[0-9]+.[0-9]+"] + branches: ["main", "v[0-9]+.[0-9]+.x"] pull_request: - branches: ["main", "v[0-9]+.[0-9]+"] + branches: ["main", "v[0-9]+.[0-9]+.x"] env: CARGO_TERM_COLOR: always @@ -87,16 +87,16 @@ jobs: - uses: Swatinem/rust-cache@v2 - run: cargo install cargo-hack cargo-minimal-versions --locked - run: cargo minimal-versions check --workspace --all-features --direct -# codecov: -# runs-on: ubuntu-latest -# steps: -# - uses: actions/checkout@v4 -# - run: sudo apt update && sudo apt install -y cmake clang-15 -# - uses: dtolnay/rust-toolchain@nightly -# - uses: Swatinem/rust-cache@v2 -# - run: cargo +nightly install cargo-llvm-cov --locked -# - run: cargo +nightly llvm-cov --all-features --doctests --lcov --output-path lcov.info -# - name: Upload coverage reports to Codecov -# uses: codecov/codecov-action@v4 -# with: -# token: ${{ secrets.CODECOV_TOKEN }} + codecov: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - run: sudo apt update && sudo apt install -y cmake clang-15 + - uses: dtolnay/rust-toolchain@nightly + - uses: Swatinem/rust-cache@v2 + - run: cargo +nightly install cargo-llvm-cov --locked + - run: cargo +nightly llvm-cov --all-features --doctests --lcov --output-path lcov.info + - name: Upload coverage reports to Codecov + uses: codecov/codecov-action@v4 + with: + token: ${{ secrets.CODECOV_TOKEN }} diff --git a/CHANGELOG.md b/CHANGELOG.md index b01ad5de..111fe991 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,49 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added +- Add `array::codec::{InvalidBytesLengthError,InvalidArrayShapeError,InvalidNumberOfElementsError,SubsetOutOfBoundsError}` +- Add `ArraySubset::inbounds_shape()` (matches the old `ArraySubset::inbounds` behaviour) +- Add `ArrayBytesFixedDisjointView[CreateError]` + +### Changed +- **Breaking**: change `ArraySubset::inbounds` to take another subset rather than a shape +- **Breaking**: `CodecError` enum changes: + - Change `CodecError::UnexpectedChunkDecodedSize` to an `InvalidBytesLengthError` + - Add `CodecError::{InvalidArrayShape,InvalidNumberOfElements,SubsetOutOfBounds,RawBytesOffsetsCreate,RawBytesOffsetsOutOfBounds}` +- **Breaking**: Change output args to `ArrayBytesFixedDisjointView` and make safe the following: + - `Array::[async_]retrieve_chunk[_subset]_into` + - `[Async]ArrayPartialDecoderTraits::partial_decode_into` + - `ArrayToBytesCodecTraits::decode_into` + - `zarrs::array::copy_fill_value_into` + - `zarrs::array::update_array_bytes` +- **Breaking**: change `RawBytesOffsets` into a validated newtype +- **Breaking**: `ArrayBytes::new_vlen()` now returns a `Result` and validates bytes/offsets compatibility +- Re-enable broken compatibility tests that are now fixed in `zarr-python`/`numcodecs` +- **Breaking**: move the `zarrs::array::{data_type,fill_value}` modules into the `zarrs_data_type` crate +- Bump `lru` to 0.13 + +## [0.19.2] - 2025-02-13 + +### Changed +- Bump `zarrs_metadata` to 0.3.4 which includes a number of Zarr metadata fixes + - See the [`zarrs_metadata` CHANGELOG.md](https://github.com/LDeakin/zarrs/blob/main/zarrs_metadata/CHANGELOG.md) + +## [0.19.1] - 2025-01-19 + +### Added +- Document that elements in `ArrayBytes` must be in C-contiguous order + +### Changed +- Use new language/library 
features added between Rust 1.78 and 1.82 (internal) +- Clean up root docs and README, removing the ZEPs table and ecosystem table + +### Fixed +- Fix new clippy lints +- Mark `String` and `Bytes` data types as experimental in their docs +- Mark `rectangular` chunk grid as experimental since it is based on a draft ZEP +- Add missing invariant to `[partial_]decode_into` safety docs + ## [0.19.0] - 2025-01-10 ### Highlights @@ -1215,7 +1258,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Initial public release -[unreleased]: https://github.com/LDeakin/zarrs/compare/zarrs-v0.19.0...HEAD +[unreleased]: https://github.com/LDeakin/zarrs/compare/zarrs-v0.19.2...HEAD +[0.19.2]: https://github.com/LDeakin/zarrs/releases/tag/zarrs-v0.19.2 +[0.19.1]: https://github.com/LDeakin/zarrs/releases/tag/zarrs-v0.19.1 [0.19.0]: https://github.com/LDeakin/zarrs/releases/tag/zarrs-v0.19.0 [0.18.3]: https://github.com/LDeakin/zarrs/releases/tag/zarrs-v0.18.3 [0.18.2]: https://github.com/LDeakin/zarrs/releases/tag/zarrs-v0.18.2 diff --git a/CITATION.cff b/CITATION.cff index 44437c92..86bd8679 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,8 +1,8 @@ cff-version: 1.2.0 message: "If you use this software, please cite it as below." title: "zarrs" -version: 0.19.0 -date-released: 2025-01-10 +version: 0.19.2 +date-released: 2025-02-13 repository-code: "https://github.com/LDeakin/zarrs" url: "https://zarrs.dev" abstract: "zarrs is a Rust library for the Zarr storage format for multidimensional arrays and metadata." diff --git a/Cargo.toml b/Cargo.toml index 41825a07..dfbcfaf6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ resolver = "2" members = [ "zarrs", + "zarrs_data_type", "zarrs_metadata", "zarrs_storage", "zarrs_filesystem", @@ -26,8 +27,12 @@ module_name_repetitions = "allow" missing_panics_doc = "warn" missing_errors_doc = "warn" +[workspace.dependencies.zarrs_data_type] +version = "0.1.0" +path = "zarrs_data_type" + [workspace.dependencies.zarrs_metadata] -version = "0.3.0" +version = "0.3.4" path = "zarrs_metadata" [workspace.dependencies.zarrs_storage] @@ -62,3 +67,10 @@ version = "0.51.0" [workspace.dependencies.zip] version = "2.1.3" + +[workspace.dependencies.half] +version = "2.0.0" +features = ["bytemuck"] + +[workspace.dependencies.num] +version = "0.4.1" diff --git a/README.md b/README.md index 9435fedc..31b03404 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,29 @@ # zarrs [![Latest Version](https://img.shields.io/crates/v/zarrs.svg)](https://crates.io/crates/zarrs) -[![zarrs documentation](https://docs.rs/zarrs/badge.svg)](https://docs.rs/zarrs) +[![zarrs documentation](https://docs.rs/zarrs/badge.svg)][documentation] ![msrv](https://img.shields.io/crates/msrv/zarrs) [![downloads](https://img.shields.io/crates/d/zarrs)](https://crates.io/crates/zarrs) [![build](https://github.com/LDeakin/zarrs/actions/workflows/ci.yml/badge.svg)](https://github.com/LDeakin/zarrs/actions/workflows/ci.yml) [![codecov](https://codecov.io/gh/LDeakin/zarrs/graph/badge.svg?token=OBKJQNAZPP)](https://codecov.io/gh/LDeakin/zarrs) [![DOI](https://zenodo.org/badge/695021547.svg)](https://zenodo.org/badge/latestdoi/695021547) -`zarrs` is a Rust library for the [Zarr](https://zarr.dev) storage format for multidimensional arrays and metadata. 
It supports [Zarr V3](https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html) and a [V3 compatible subset](https://docs.rs/zarrs/latest/zarrs/#implementation-status) of [Zarr V2](https://zarr-specs.readthedocs.io/en/latest/v2/v2.0.html). +`zarrs` is a Rust library for the [Zarr] storage format for multidimensional arrays and metadata. It supports [Zarr V3] and a V3 compatible subset of [Zarr V2]. -A changelog can be found [here](https://github.com/LDeakin/zarrs/blob/main/CHANGELOG.md). -Correctness issues with past versions are [detailed here](https://github.com/LDeakin/zarrs/blob/main/doc/correctness_issues.md). +A changelog can be found [here][CHANGELOG]. +Correctness issues with past versions are [detailed here][correctness_issues]. -Developed at the [Department of Materials Physics](https://physics.anu.edu.au/research/mp/), Australian National University, Canberra, Australia. +Developed at the [Department of Materials Physics, Australian National University, Canberra, Australia]. > [!TIP] -> If you are a Python user, check out [`zarrs-python`](https://github.com/ilan-gold/zarrs-python). -> It includes a high-performance codec pipeline for the reference [`zarr-python`](https://github.com/zarr-developers/zarr-python) implementation. +> If you are a Python user, check out [`zarrs-python`]. +> It includes a high-performance codec pipeline for the reference [`zarr-python`] implementation. ## Getting Started -- Review the [implementation status](https://docs.rs/zarrs/latest/zarrs/#implementation-status), [array support](https://docs.rs/zarrs/latest/zarrs/#array-support), and [storage support](https://docs.rs/zarrs/latest/zarrs/#storage-support). -- Read [The `zarrs` Book](https://book.zarrs.dev). -- View the [examples](https://github.com/LDeakin/zarrs/tree/main/zarrs/examples) and [the example below](#example). -- Read the [documentation](https://docs.rs/zarrs/latest/zarrs/). [`array::Array`](https://docs.rs/zarrs/latest/zarrs/array/struct.Array.html) is a good place to start. -- Check out the [`zarrs` ecosystem](#zarrs-ecosystem). +- Review the [implementation status] ([zarr version support], [array support], [storage support], and the [`zarrs` ecosystem](#zarrs-ecosystem)). +- Read [The `zarrs` Book]. +- View the [examples] and [the example below](#example). +- Read the [documentation]. 
## Example ```rust @@ -90,61 +89,31 @@ println!("{array_ndarray:4}"); ## `zarrs` Ecosystem -| Crate | Docs / Description | -| --------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- | -| **Core** | | -| [![zarrs_ver]](https://crates.io/crates/zarrs) [zarrs] | [![docs]](https://docs.rs/zarrs) The core library for manipulating Zarr hierarchies | -| [![zarrs_metadata_ver]](https://crates.io/crates/zarrs_metadata) [zarrs_metadata] | [![docs]](https://docs.rs/zarrs_metadata) Zarr metadata support (re-exported as `zarrs::metadata`) | -| [![zarrs_storage_ver]](https://crates.io/crates/zarrs_storage) [zarrs_storage] | [![docs]](https://docs.rs/zarrs_storage) The storage API for `zarrs` (re-exported as `zarrs::storage`) | -| **Stores** | | -| [![zarrs_filesystem_ver]](https://crates.io/crates/zarrs_filesystem) [zarrs_filesystem] | [![docs]](https://docs.rs/zarrs_filesystem) A filesystem store (re-exported as `zarrs::filesystem`) | -| [![zarrs_object_store_ver]](https://crates.io/crates/zarrs_object_store) [zarrs_object_store] | [![docs]](https://docs.rs/zarrs_object_store) [`object_store`](https://docs.rs/object_store/latest/object_store/) store support | -| [![zarrs_opendal_ver]](https://crates.io/crates/zarrs_opendal) [zarrs_opendal] | [![docs]](https://docs.rs/zarrs_opendal) [`opendal`](https://docs.rs/opendal/latest/opendal/) store support | -| [![zarrs_http_ver]](https://crates.io/crates/zarrs_http) [zarrs_http] | [![docs]](https://docs.rs/zarrs_http) A synchronous http store | -| [![zarrs_zip_ver]](https://crates.io/crates/zarrs_zip) [zarrs_zip] | [![docs]](https://docs.rs/zarrs_zip) A storage adapter for zip files | -| [![zarrs_icechunk_ver]](https://crates.io/crates/zarrs_icechunk) [zarrs_icechunk] | [![docs]](https://docs.rs/zarrs_icechunk) [`icechunk`](https://docs.rs/icechunk/latest/icechunk/) store support | -| **Bindings** | | -| [![zarrs_python_ver]](https://pypi.org/project/zarrs/) [zarrs-python] | [![docs]](https://zarrs-python.readthedocs.io/en/latest/) A codec pipeline for [zarr-python] | -| [![zarrs_ffi_ver]](https://crates.io/crates/zarrs_ffi) [zarrs_ffi] | [![docs]](https://docs.rs/zarrs_ffi) A subset of `zarrs` exposed as a C/C++ API | -| **Zarr Metadata Conventions** | | -| [![ome_zarr_metadata_ver]](https://crates.io/crates/ome_zarr_metadata) [ome_zarr_metadata] | [![docs]](https://docs.rs/ome_zarr_metadata) A library for OME-Zarr (previously OME-NGFF) metadata | - -[docs]: https://img.shields.io/badge/docs-brightgreen -[zarrs_ver]: https://img.shields.io/crates/v/zarrs -[zarrs]: https://github.com/LDeakin/zarrs/tree/main/zarrs -[zarrs_metadata_ver]: https://img.shields.io/crates/v/zarrs_metadata -[zarrs_metadata]: https://github.com/LDeakin/zarrs/tree/main/zarrs_metadata -[zarrs_storage_ver]: https://img.shields.io/crates/v/zarrs_storage -[zarrs_storage]: https://github.com/LDeakin/zarrs/tree/main/zarrs_storage -[zarrs_filesystem_ver]: https://img.shields.io/crates/v/zarrs_filesystem -[zarrs_filesystem]: https://github.com/LDeakin/zarrs/tree/main/zarrs_filesystem -[zarrs_http_ver]: https://img.shields.io/crates/v/zarrs_http -[zarrs_http]: https://github.com/LDeakin/zarrs/tree/main/zarrs_http -[zarrs_object_store_ver]: https://img.shields.io/crates/v/zarrs_object_store -[zarrs_object_store]: https://github.com/LDeakin/zarrs/tree/main/zarrs_object_store -[zarrs_opendal_ver]: 
https://img.shields.io/crates/v/zarrs_opendal -[zarrs_opendal]: https://github.com/LDeakin/zarrs/tree/main/zarrs_opendal -[zarrs_zip_ver]: https://img.shields.io/crates/v/zarrs_zip -[zarrs_zip]: https://github.com/LDeakin/zarrs/tree/main/zarrs_zip -[zarrs_icechunk_ver]: https://img.shields.io/crates/v/zarrs_icechunk -[zarrs_icechunk]: https://github.com/LDeakin/zarrs_icechunk -[zarrs_ffi_ver]: https://img.shields.io/crates/v/zarrs_ffi -[zarrs_ffi]: https://github.com/LDeakin/zarrs_ffi -[zarrs_python_ver]: https://img.shields.io/pypi/v/zarrs -[zarrs-python]: https://github.com/ilan-gold/zarrs-python -[zarr-python]: https://github.com/zarr-developers/zarr-python -[ome_zarr_metadata_ver]: https://img.shields.io/crates/v/ome_zarr_metadata -[ome_zarr_metadata]: https://github.com/LDeakin/rust_ome_zarr_metadata - -#### [zarrs_tools] -[![zarrs_tools_ver]](https://crates.io/crates/zarrs_tools) [![zarrs_tools_doc]](https://docs.rs/zarrs_tools) - -[zarrs_tools]: https://github.com/LDeakin/zarrs_tools -[zarrs_tools_ver]: https://img.shields.io/crates/v/zarrs_tools.svg -[zarrs_tools_doc]: https://docs.rs/zarrs_tools/badge.svg - +### Core +- [`zarrs`]: The core library for manipulating Zarr hierarchies. +- [`zarrs_data_type`]: Zarr data types (re-exported as `zarrs::data_type`). +- [`zarrs_metadata`]: Zarr metadata support (re-exported as `zarrs::metadata`). +- [`zarrs_storage`]: The storage API for `zarrs` (re-exported as `zarrs::storage`). + +### Stores +- [`zarrs_filesystem`]: A filesystem store (re-exported as `zarrs::filesystem`). +- [`zarrs_object_store`]: [`object_store`] store support. +- [`zarrs_opendal`]: [`opendal`] store support. +- [`zarrs_http`]: A synchronous http store. +- [`zarrs_zip`]: A storage adapter for zip files. +- [`zarrs_icechunk`]: [`icechunk`] store support. + +### Bindings +- [`zarrs-python`]: A high-performance codec pipeline for [`zarr-python`]. +- [`zarrs_ffi`]: A subset of `zarrs` exposed as a C/C++ API. + +### Zarr Metadata Conventions +- [`ome_zarr_metadata`]: A library for OME-Zarr (previously OME-NGFF) metadata. + +### Tools +- [`zarrs_tools`]: Various tools for creating and manipulating Zarr V3 data with the `zarrs` Rust crate - A reencoder that can change codecs, chunk shape, convert Zarr V2 to V3, etc. - - Create an [OME-Zarr](https://ngff.openmicroscopy.org/latest/) hierarchy from a Zarr array. + - Create an [OME-Zarr] hierarchy from a Zarr array. - Transform arrays: crop, rescale, downsample, gradient magnitude, gaussian, noise filtering, etc. - Benchmarking tools and performance benchmarks of `zarrs`. @@ -154,3 +123,39 @@ println!("{array_ndarray:4}"); - the MIT license [LICENSE-MIT](./LICENCE-MIT) or <http://opensource.org/licenses/MIT>, at your option. Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any additional terms or conditions. 
+ +[CHANGELOG]: https://github.com/LDeakin/zarrs/blob/main/CHANGELOG.md +[correctness_issues]: https://github.com/LDeakin/zarrs/blob/main/doc/correctness_issues.md +[implementation status]: https://docs.rs/zarrs/latest/zarrs/#implementation-status +[zarr version support]: https://docs.rs/zarrs/latest/zarrs/#zarr-version-support +[array support]: https://docs.rs/zarrs/latest/zarrs/#array-support +[storage support]: https://docs.rs/zarrs/latest/zarrs/#storage-support +[examples]: https://github.com/LDeakin/zarrs/tree/main/zarrs/examples +[documentation]: https://docs.rs/zarrs/latest/zarrs/ +[The `zarrs` Book]: https://book.zarrs.dev + +[`zarrs`]: https://github.com/LDeakin/zarrs/tree/main/zarrs +[`zarrs_data_type`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_data_type +[`zarrs_metadata`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_metadata +[`zarrs_storage`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_storage +[`zarrs_filesystem`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_filesystem +[`zarrs_http`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_http +[`zarrs_object_store`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_object_store +[`zarrs_opendal`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_opendal +[`zarrs_zip`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_zip +[`zarrs_icechunk`]: https://github.com/LDeakin/zarrs_icechunk +[`zarrs_ffi`]: https://github.com/LDeakin/zarrs_ffi +[`zarrs-python`]: https://github.com/ilan-gold/zarrs-python +[`zarr-python`]: https://github.com/zarr-developers/zarr-python +[`zarrs_tools`]: https://github.com/LDeakin/zarrs_tools +[`ome_zarr_metadata`]: https://github.com/LDeakin/rust_ome_zarr_metadata +[`object_store`]: https://github.com/apache/arrow-rs/tree/main/object_store +[`opendal`]: https://github.com/apache/OpenDAL +[`icechunk`]: https://github.com/earth-mover/icechunk + +[Zarr]: https://zarr.dev +[Zarr V3]: https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html +[Zarr V2]: https://zarr-specs.readthedocs.io/en/latest/v2/v2.0.html +[OME-Zarr]: https://ngff.openmicroscopy.org/latest/ + +[Department of Materials Physics, Australian National University, Canberra, Australia]: https://physics.anu.edu.au/research/mp/ diff --git a/zarrs/Cargo.toml b/zarrs/Cargo.toml index d3c3168a..881b5493 100644 --- a/zarrs/Cargo.toml +++ b/zarrs/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "zarrs" -version = "0.19.0" +version = "0.20.0-dev" authors = ["Lachlan Deakin "] edition = "2021" rust-version = "1.82" @@ -52,13 +52,13 @@ derive_more = { version = "1.0.0", features = ["deref", "display", "from"] } flate2 = { version = "1.0.30", optional = true } futures = { version = "0.3.29", optional = true } gdeflate-sys = { version = "0.4.1", optional = true } -half = { version = "2.0.0", features = ["bytemuck"] } +half = { workspace = true } inventory = "0.3.0" itertools = "0.14.0" -lru = "0.12.4" +lru = "0.13.0" moka = { version = "0.12.8", features = ["sync"] } ndarray = { version = ">=0.15.0,<17", optional = true } -num = { version = "0.4.1" } +num = { workspace = true } pco = { version = "0.4.0", optional = true } rayon = "1.10.0" rayon_iter_concurrent_limit = "0.2.0" @@ -68,6 +68,7 @@ thiserror = "2.0.0" thread_local = "1.1.8" unsafe_cell_slice = "0.2.0" zarrs_filesystem = { workspace = true, optional = true } +zarrs_data_type = { workspace = true } zarrs_metadata = { workspace = true } zarrs_storage = { workspace = true } zfp-sys = {version = "0.3.0", features = ["static"], optional = true } diff --git a/zarrs/doc/ecosystem.md 
b/zarrs/doc/ecosystem.md index a048289d..db528a37 100644 --- a/zarrs/doc/ecosystem.md +++ b/zarrs/doc/ecosystem.md @@ -1,57 +1,48 @@ -| Crate | Docs / Description | -| --------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- | -| **Core** | | -| [![zarrs_ver]](https://crates.io/crates/zarrs) [zarrs] | [![docs]](https://docs.rs/zarrs) The core library for manipulating Zarr hierarchies | -| [![zarrs_metadata_ver]](https://crates.io/crates/zarrs_metadata) [zarrs_metadata] | [![docs]](https://docs.rs/zarrs_metadata) Zarr metadata support (re-exported as `zarrs::metadata`) | -| [![zarrs_storage_ver]](https://crates.io/crates/zarrs_storage) [zarrs_storage] | [![docs]](https://docs.rs/zarrs_storage) The storage API for `zarrs` (re-exported as `zarrs::storage`) | -| **Stores** | | -| [![zarrs_filesystem_ver]](https://crates.io/crates/zarrs_filesystem) [zarrs_filesystem] | [![docs]](https://docs.rs/zarrs_filesystem) A filesystem store (re-exported as `zarrs::filesystem`) | -| [![zarrs_object_store_ver]](https://crates.io/crates/zarrs_object_store) [zarrs_object_store] | [![docs]](https://docs.rs/zarrs_object_store) [`object_store`](https://docs.rs/object_store/latest/object_store/) store support | -| [![zarrs_opendal_ver]](https://crates.io/crates/zarrs_opendal) [zarrs_opendal] | [![docs]](https://docs.rs/zarrs_opendal) [`opendal`](https://docs.rs/opendal/latest/opendal/) store support | -| [![zarrs_http_ver]](https://crates.io/crates/zarrs_http) [zarrs_http] | [![docs]](https://docs.rs/zarrs_http) A synchronous http store | -| [![zarrs_zip_ver]](https://crates.io/crates/zarrs_zip) [zarrs_zip] | [![docs]](https://docs.rs/zarrs_zip) A storage adapter for zip files | -| [![zarrs_icechunk_ver]](https://crates.io/crates/zarrs_icechunk) [zarrs_icechunk] | [![docs]](https://docs.rs/zarrs_icechunk) [`icechunk`](https://docs.rs/icechunk/latest/icechunk/) store support | -| **Bindings** | | -| [![zarrs_python_ver]](https://pypi.org/project/zarrs/) [zarrs-python] | [![docs]](https://zarrs-python.readthedocs.io/en/latest/) A codec pipeline for [zarr-python] | -| [![zarrs_ffi_ver]](https://crates.io/crates/zarrs_ffi) [zarrs_ffi] | [![docs]](https://docs.rs/zarrs_ffi) A subset of `zarrs` exposed as a C/C++ API | -| **Zarr Metadata Conventions** | | -| [![ome_zarr_metadata_ver]](https://crates.io/crates/ome_zarr_metadata) [ome_zarr_metadata] | [![docs]](https://docs.rs/ome_zarr_metadata) A library for OME-Zarr (previously OME-NGFF) metadata | +#### Core +- [`zarrs`]: The core library for manipulating Zarr hierarchies. +- [`zarrs_data_type`]: Zarr data types (re-exported as `zarrs::data_type`). +- [`zarrs_metadata`]: Zarr metadata support (re-exported as `zarrs::metadata`). +- [`zarrs_storage`]: The storage API for `zarrs` (re-exported as `zarrs::storage`). 
-[docs]: https://img.shields.io/badge/docs-brightgreen -[zarrs_ver]: https://img.shields.io/crates/v/zarrs -[zarrs]: https://github.com/LDeakin/zarrs/tree/main/zarrs -[zarrs_metadata_ver]: https://img.shields.io/crates/v/zarrs_metadata -[zarrs_metadata]: https://github.com/LDeakin/zarrs/tree/main/zarrs_metadata -[zarrs_storage_ver]: https://img.shields.io/crates/v/zarrs_storage -[zarrs_storage]: https://github.com/LDeakin/zarrs/tree/main/zarrs_storage -[zarrs_filesystem_ver]: https://img.shields.io/crates/v/zarrs_filesystem -[zarrs_filesystem]: https://github.com/LDeakin/zarrs/tree/main/zarrs_filesystem -[zarrs_http_ver]: https://img.shields.io/crates/v/zarrs_http -[zarrs_http]: https://github.com/LDeakin/zarrs/tree/main/zarrs_http -[zarrs_object_store_ver]: https://img.shields.io/crates/v/zarrs_object_store -[zarrs_object_store]: https://github.com/LDeakin/zarrs/tree/main/zarrs_object_store -[zarrs_opendal_ver]: https://img.shields.io/crates/v/zarrs_opendal -[zarrs_opendal]: https://github.com/LDeakin/zarrs/tree/main/zarrs_opendal -[zarrs_zip_ver]: https://img.shields.io/crates/v/zarrs_zip -[zarrs_zip]: https://github.com/LDeakin/zarrs/tree/main/zarrs_zip -[zarrs_icechunk_ver]: https://img.shields.io/crates/v/zarrs_icechunk -[zarrs_icechunk]: https://github.com/LDeakin/zarrs_icechunk -[zarrs_ffi_ver]: https://img.shields.io/crates/v/zarrs_ffi -[zarrs_ffi]: https://github.com/LDeakin/zarrs_ffi -[zarrs_python_ver]: https://img.shields.io/pypi/v/zarrs -[zarrs-python]: https://github.com/ilan-gold/zarrs-python -[zarr-python]: https://github.com/zarr-developers/zarr-python -[ome_zarr_metadata_ver]: https://img.shields.io/crates/v/ome_zarr_metadata -[ome_zarr_metadata]: https://github.com/LDeakin/rust_ome_zarr_metadata +#### Stores +- [`zarrs_filesystem`]: A filesystem store (re-exported as `zarrs::filesystem`). +- [`zarrs_object_store`]: [`object_store`] store support. +- [`zarrs_opendal`]: [`opendal`] store support. +- [`zarrs_http`]: A synchronous http store. +- [`zarrs_zip`]: A storage adapter for zip files. +- [`zarrs_icechunk`]: [`icechunk`] store support. -#### [zarrs_tools] -[![zarrs_tools_ver]](https://crates.io/crates/zarrs_tools) [![zarrs_tools_doc]](https://docs.rs/zarrs_tools) +#### Bindings +- [`zarrs-python`]: A high-performance codec pipeline for [`zarr-python`]. +- [`zarrs_ffi`]: A subset of `zarrs` exposed as a C/C++ API. -[zarrs_tools]: https://github.com/LDeakin/zarrs_tools -[zarrs_tools_ver]: https://img.shields.io/crates/v/zarrs_tools.svg -[zarrs_tools_doc]: https://docs.rs/zarrs_tools/badge.svg +#### Zarr Metadata Conventions +- [`ome_zarr_metadata`]: A library for OME-Zarr (previously OME-NGFF) metadata. +#### Tools +- [`zarrs_tools`]: Various tools for creating and manipulating Zarr V3 data with the `zarrs` Rust crate - A reencoder that can change codecs, chunk shape, convert Zarr V2 to V3, etc. - - Create an [OME-Zarr](https://ngff.openmicroscopy.org/latest/) hierarchy from a Zarr array. + - Create an [OME-Zarr] hierarchy from a Zarr array. - Transform arrays: crop, rescale, downsample, gradient magnitude, gaussian, noise filtering, etc. - Benchmarking tools and performance benchmarks of `zarrs`. 
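Since `zarrs_data_type` is new in this release, a brief import sketch may help orient downstream users. It is based solely on the re-exports added in this PR (`pub use crate::data_type;` and `pub use crate::data_type::{DataType, FillValue};` in `zarrs/src/array.rs`); it is illustrative, not an official migration guide:

```rust
// New canonical path: the `zarrs_data_type` crate, re-exported as `zarrs::data_type`.
use zarrs::data_type::{DataType, FillValue};

// Pre-0.20 paths still compile via the compatibility re-exports in `zarrs::array`.
use zarrs::array::{DataType as _, FillValue as _};
```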
+ +[`zarrs`]: https://github.com/LDeakin/zarrs/tree/main/zarrs +[`zarrs_data_type`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_data_type +[`zarrs_metadata`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_metadata +[`zarrs_storage`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_storage +[`zarrs_filesystem`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_filesystem +[`zarrs_http`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_http +[`zarrs_object_store`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_object_store +[`zarrs_opendal`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_opendal +[`zarrs_zip`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_zip +[`zarrs_icechunk`]: https://github.com/LDeakin/zarrs_icechunk +[`zarrs_ffi`]: https://github.com/LDeakin/zarrs_ffi +[`zarrs-python`]: https://github.com/ilan-gold/zarrs-python +[`zarr-python`]: https://github.com/zarr-developers/zarr-python +[`zarrs_tools`]: https://github.com/LDeakin/zarrs_tools +[`ome_zarr_metadata`]: https://github.com/LDeakin/rust_ome_zarr_metadata +[`object_store`]: https://github.com/apache/arrow-rs/tree/main/object_store +[`opendal`]: https://github.com/apache/OpenDAL +[`icechunk`]: https://github.com/earth-mover/icechunk + +[OME-Zarr]: https://ngff.openmicroscopy.org/latest/ diff --git a/zarrs/doc/status/ZEPs.md b/zarrs/doc/status/ZEPs.md deleted file mode 100644 index fd653605..00000000 --- a/zarrs/doc/status/ZEPs.md +++ /dev/null @@ -1,14 +0,0 @@ -| [Zarr Enhancement Proposal] | Status | Zarrs | -| --------------------------------------- | -------------------------- | ------------ | -| [ZEP0001]: Zarr specification version 3 | Accepted | Full support | -| [ZEP0002]: Sharding codec | Accepted | Full support | -| Draft [ZEP0003]: Variable chunking | [zarr-developers #52] | Full support | -| Draft ZEP0007: Strings | [zarr-developers/zeps #47] | Prototype | - -[Zarr Enhancement Proposal]: https://zarr.dev/zeps/ -[ZEP0001]: https://zarr.dev/zeps/accepted/ZEP0001.html -[ZEP0002]: https://zarr.dev/zeps/accepted/ZEP0002.html -[ZEP0003]: https://zarr.dev/zeps/draft/ZEP0003.html - -[zarr-developers #52]: https://github.com/orgs/zarr-developers/discussions/52 -[zarr-developers/zeps #47]: https://github.com/zarr-developers/zeps/pull/47#issuecomment-1710505141 diff --git a/zarrs/doc/status/chunk_grids.md b/zarrs/doc/status/chunk_grids.md index 3de5024e..ff033044 100644 --- a/zarrs/doc/status/chunk_grids.md +++ b/zarrs/doc/status/chunk_grids.md @@ -1,7 +1,7 @@ -| Chunk Grid | ZEP | V3 | V2 | Feature Flag | -| ------------- | --------- | ------- | ------- | ------------ | -| [regular] | [ZEP0001] | ✓ | ✓ | | -| [rectangular] | [ZEP0003] | ✓ | | | +| Chunk Grid | ZEP | V3 | V2 | Feature Flag | +| ---------------------------- | ----------------- | ------- | ------- | ------------ | +| [regular] | [ZEP0001] | ✓ | ✓ | | +| [rectangular] (experimental) | [ZEP0003] (draft) | ✓ | | | [regular]: crate::array::chunk_grid::RegularChunkGrid [rectangular]: crate::array::chunk_grid::RectangularChunkGrid diff --git a/zarrs/doc/status/data_types.md b/zarrs/doc/status/data_types.md index af053bf6..2b4c1202 100644 --- a/zarrs/doc/status/data_types.md +++ b/zarrs/doc/status/data_types.md @@ -8,24 +8,24 @@ † Experimental data types are recommended for evaluation only. 
-[bool]: crate::array::data_type::DataType::Bool -[int8]: crate::array::data_type::DataType::Int8 -[int16]: crate::array::data_type::DataType::Int16 -[int32]: crate::array::data_type::DataType::Int32 -[int64]: crate::array::data_type::DataType::Int64 -[uint8]: crate::array::data_type::DataType::UInt8 -[uint16]: crate::array::data_type::DataType::UInt16 -[uint32]: crate::array::data_type::DataType::UInt32 -[uint64]: crate::array::data_type::DataType::UInt64 -[float16]: crate::array::data_type::DataType::Float16 -[float32]: crate::array::data_type::DataType::Float32 -[float64]: crate::array::data_type::DataType::Float64 -[complex64]: crate::array::data_type::DataType::Complex64 -[complex128]: crate::array::data_type::DataType::Complex128 -[bfloat16]: crate::array::data_type::DataType::BFloat16 -[r* (raw bits)]: crate::array::data_type::DataType::RawBits -[string]: crate::array::data_type::DataType::String -[bytes]: crate::array::data_type::DataType::Bytes +[bool]: crate::data_type::DataType::Bool +[int8]: crate::data_type::DataType::Int8 +[int16]: crate::data_type::DataType::Int16 +[int32]: crate::data_type::DataType::Int32 +[int64]: crate::data_type::DataType::Int64 +[uint8]: crate::data_type::DataType::UInt8 +[uint16]: crate::data_type::DataType::UInt16 +[uint32]: crate::data_type::DataType::UInt32 +[uint64]: crate::data_type::DataType::UInt64 +[float16]: crate::data_type::DataType::Float16 +[float32]: crate::data_type::DataType::Float32 +[float64]: crate::data_type::DataType::Float64 +[complex64]: crate::data_type::DataType::Complex64 +[complex128]: crate::data_type::DataType::Complex128 +[bfloat16]: crate::data_type::DataType::BFloat16 +[r* (raw bits)]: crate::data_type::DataType::RawBits +[string]: crate::data_type::DataType::String +[bytes]: crate::data_type::DataType::Bytes [ZEP0001]: https://zarr.dev/zeps/accepted/ZEP0001.html [zarr-specs #130]: https://github.com/zarr-developers/zarr-specs/issues/130 diff --git a/zarrs/src/array.rs b/zarrs/src/array.rs index e76d28db..f0ae3f96 100644 --- a/zarrs/src/array.rs +++ b/zarrs/src/array.rs @@ -23,6 +23,7 @@ mod array_builder; mod array_bytes; +mod array_bytes_fixed_disjoint_view; mod array_errors; mod array_metadata_options; mod array_representation; @@ -32,10 +33,9 @@ pub mod chunk_grid; pub mod chunk_key_encoding; pub mod codec; pub mod concurrency; -pub mod data_type; mod element; -mod fill_value; pub mod storage_transformer; +pub use crate::data_type; // re-export for zarrs < 0.20 compat #[cfg(feature = "sharding")] mod array_sharded_ext; @@ -48,7 +48,10 @@ pub use self::{ array_builder::ArrayBuilder, array_bytes::{ copy_fill_value_into, update_array_bytes, ArrayBytes, ArrayBytesError, RawBytes, - RawBytesOffsets, + RawBytesOffsets, RawBytesOffsetsCreateError, RawBytesOffsetsOutOfBoundsError, + }, + array_bytes_fixed_disjoint_view::{ + ArrayBytesFixedDisjointView, ArrayBytesFixedDisjointViewCreateError, }, array_errors::{ArrayCreateError, ArrayError}, array_metadata_options::ArrayMetadataOptions, @@ -61,11 +64,11 @@ pub use self::{ codec::ArrayCodecTraits, codec::CodecChain, concurrency::RecommendedConcurrency, - data_type::DataType, element::{Element, ElementFixedLength, ElementOwned}, - fill_value::FillValue, storage_transformer::StorageTransformerChain, }; +pub use crate::data_type::{DataType, FillValue}; // re-export for zarrs < 0.20 compat + pub use crate::metadata::v2::ArrayMetadataV2; use crate::metadata::v2_to_v3::ArrayMetadataV2ToV3ConversionError; pub use crate::metadata::v3::{ @@ -603,7 +606,7 @@ impl Array { // Codec 
metadata manipulation match &mut metadata { ArrayMetadata::V3(metadata) => { - metadata.codecs = self.codecs().create_metadatas_opt(options); + metadata.codecs = self.codecs().create_metadatas_opt(options.codec_options()); } ArrayMetadata::V2(_metadata) => { // NOTE: The codec related options in ArrayMetadataOptions do not impact V2 codecs @@ -910,10 +913,7 @@ pub fn elements_to_ndarray<T>( ) -> Result<ndarray::ArrayD<T>, ArrayError> { let length = elements.len(); ndarray::ArrayD::<T>::from_shape_vec(iter_u64_to_usize(shape.iter()), elements).map_err(|_| { - ArrayError::CodecError(codec::CodecError::UnexpectedChunkDecodedSize( - length * std::mem::size_of::<T>(), - shape.iter().product::<u64>() * std::mem::size_of::<T>() as u64, - )) + ArrayError::CodecError(codec::InvalidArrayShapeError::new(shape.to_vec(), length).into()) }) } @@ -926,7 +926,7 @@ pub fn bytes_to_ndarray<T: bytemuck::Pod>( shape: &[u64], bytes: Vec<u8>, ) -> Result<ndarray::ArrayD<T>, ArrayError> { - let expected_len = shape.iter().product::<u64>() * core::mem::size_of::<T>() as u64; + let expected_len = shape.iter().product::<u64>() * size_of::<T>() as u64; if bytes.len() as u64 != expected_len { return Err(ArrayError::InvalidBytesInputSize(bytes.len(), expected_len)); } @@ -1102,12 +1102,12 @@ mod tests { ) } - #[ignore] // FIXME: Reported upstream https://github.com/zarr-developers/zarr-python/issues/2675 + #[cfg(feature = "transpose")] #[test] fn array_v2_none_f() { array_v2_to_v3( "tests/data/v2/array_none_F.zarr", - "tests/data/v3/array_none_tranpose.zarr", + "tests/data/v3/array_none_transpose.zarr", ) } @@ -1122,7 +1122,6 @@ } #[cfg(feature = "blosc")] - #[ignore] // FIXME: Reported upstream https://github.com/zarr-developers/zarr-python/issues/2675 #[test] #[cfg_attr(miri, ignore)] fn array_v2_blosc_f() { diff --git a/zarrs/src/array/array_async_readable.rs b/zarrs/src/array/array_async_readable.rs index 79ca0f72..f418968e 100644 --- a/zarrs/src/array/array_async_readable.rs +++ b/zarrs/src/array/array_async_readable.rs @@ -18,8 +18,8 @@ use super::{ }, concurrency::concurrency_chunks_and_codec, element::ElementOwned, - Array, ArrayBytes, ArrayCreateError, ArrayError, ArrayMetadata, ArrayMetadataV2, - ArrayMetadataV3, ArraySize, DataTypeSize, + Array, ArrayBytes, ArrayBytesFixedDisjointView, ArrayCreateError, ArrayError, ArrayMetadata, + ArrayMetadataV2, ArrayMetadataV3, ArraySize, DataTypeSize, }; #[cfg(feature = "ndarray")] @@ -335,12 +335,10 @@ impl<TStorage: ?Sized + AsyncReadableStorageTraits + 'static> Array<TStorage> { } /// Async variant of [`retrieve_chunk_into`](Array::retrieve_chunk_into). 
- async unsafe fn async_retrieve_chunk_into( + async fn async_retrieve_chunk_into( &self, chunk_indices: &[u64], - output: &UnsafeCellSlice<'_, u8>, - output_shape: &[u64], - output_subset: &ArraySubset, + output_view: &mut ArrayBytesFixedDisjointView<'_>, options: &CodecOptions, ) -> Result<(), ArrayError> { if chunk_indices.len() != self.dimensionality() { @@ -360,29 +358,17 @@ impl Array { if let Some(chunk_encoded) = chunk_encoded { let chunk_encoded: Vec = chunk_encoded.into(); let chunk_representation = self.chunk_array_representation(chunk_indices)?; - unsafe { - self.codecs() - .decode_into( - Cow::Owned(chunk_encoded), - &chunk_representation, - output, - output_shape, - output_subset, - options, - ) - .map_err(ArrayError::CodecError) - } - } else { - unsafe { - copy_fill_value_into( - self.data_type(), - self.fill_value(), - output, - output_shape, - output_subset, + self.codecs() + .decode_into( + Cow::Owned(chunk_encoded), + &chunk_representation, + output_view, + options, ) .map_err(ArrayError::CodecError) - } + } else { + copy_fill_value_into(self.data_type(), self.fill_value(), output_view) + .map_err(ArrayError::CodecError) } } @@ -650,19 +636,25 @@ impl Array { let chunk_subset = self.chunk_subset(&chunk_indices)?; let chunk_subset_overlap = chunk_subset.overlap(array_subset)?; - unsafe { - self.async_retrieve_chunk_subset_into( - &chunk_indices, - &chunk_subset_overlap - .relative_to(chunk_subset.start())?, - &output, + + let mut output_view = unsafe { + // SAFETY: chunks represent disjoint array subsets + ArrayBytesFixedDisjointView::new_unchecked( + output, + data_type_size, array_subset.shape(), - &chunk_subset_overlap - .relative_to(array_subset.start())?, - &options, + chunk_subset_overlap + .relative_to(array_subset.start()) + .unwrap(), ) - .await?; - } + }; + self.async_retrieve_chunk_subset_into( + &chunk_indices, + &chunk_subset_overlap.relative_to(chunk_subset.start())?, + &mut output_view, + &options, + ) + .await?; // let chunk_subset_bytes = self // .async_retrieve_chunk_subset_opt( // &chunk_indices, @@ -737,7 +729,7 @@ impl Array { options: &CodecOptions, ) -> Result, ArrayError> { let chunk_representation = self.chunk_array_representation(chunk_indices)?; - if !chunk_subset.inbounds(&chunk_representation.shape_u64()) { + if !chunk_subset.inbounds_shape(&chunk_representation.shape_u64()) { return Err(ArrayError::InvalidArraySubset( chunk_subset.clone(), self.shape().to_vec(), @@ -773,17 +765,15 @@ impl Array { Ok(bytes) } - async unsafe fn async_retrieve_chunk_subset_into( + async fn async_retrieve_chunk_subset_into( &self, chunk_indices: &[u64], chunk_subset: &ArraySubset, - output: &UnsafeCellSlice<'_, u8>, - output_shape: &[u64], - output_subset: &ArraySubset, + output_view: &mut ArrayBytesFixedDisjointView<'_>, options: &CodecOptions, ) -> Result<(), ArrayError> { let chunk_representation = self.chunk_array_representation(chunk_indices)?; - if !chunk_subset.inbounds(&chunk_representation.shape_u64()) { + if !chunk_subset.inbounds_shape(&chunk_representation.shape_u64()) { return Err(ArrayError::InvalidArraySubset( chunk_subset.clone(), self.shape().to_vec(), @@ -794,16 +784,8 @@ impl Array { && chunk_subset.shape() == chunk_representation.shape_u64() { // Fast path if `chunk_subset` encompasses the whole chunk - unsafe { - self.async_retrieve_chunk_into( - chunk_indices, - output, - output_shape, - output_subset, - options, - ) + self.async_retrieve_chunk_into(chunk_indices, output_view, options) .await - } } else { let storage_handle = 
Arc::new(StorageHandle::new(self.storage.clone())); let storage_transformer = self @@ -815,14 +797,12 @@ self.chunk_key(chunk_indices), )); - unsafe { - self.codecs - .clone() - .async_partial_decoder(input_handle, &chunk_representation, options) - .await? - .partial_decode_into(chunk_subset, output, output_shape, output_subset, options) - .await?; - } + self.codecs + .clone() + .async_partial_decoder(input_handle, &chunk_representation, options) + .await? + .partial_decode_into(chunk_subset, output_view, options) + .await?; Ok(()) } } diff --git a/zarrs/src/array/array_async_readable_writable.rs b/zarrs/src/array/array_async_readable_writable.rs index 4a958ede..74a3ceaa 100644 --- a/zarrs/src/array/array_async_readable_writable.rs +++ b/zarrs/src/array/array_async_readable_writable.rs @@ -158,15 +158,13 @@ impl Array &mut Self { self.additional_fields = additional_fields; self diff --git a/zarrs/src/array/array_bytes.rs b/zarrs/src/array/array_bytes.rs index 7357134b..227b8cd0 100644 --- a/zarrs/src/array/array_bytes.rs +++ b/zarrs/src/array/array_bytes.rs @@ -1,5 +1,6 @@ use std::borrow::Cow; +use derive_more::derive::Display; use itertools::Itertools; use thiserror::Error; use unsafe_cell_slice::UnsafeCellSlice; @@ -9,25 +10,46 @@ use crate::{ indexer::IncompatibleIndexerAndShapeError, metadata::v3::array::data_type::DataTypeSize, }; -use super::{codec::CodecError, ravel_indices, ArraySize, DataType, FillValue}; +use super::{ + codec::{CodecError, InvalidBytesLengthError}, + ravel_indices, ArrayBytesFixedDisjointView, ArraySize, DataType, FillValue, +}; + +mod raw_bytes_offsets; +pub use raw_bytes_offsets::{RawBytesOffsets, RawBytesOffsetsCreateError}; /// Array element bytes. +/// +/// These can represent: +/// - [`ArrayBytes::Fixed`]: fixed length elements of an array in C-contiguous order, +/// - [`ArrayBytes::Variable`]: variable length elements of an array in C-contiguous order with padding permitted, +/// - Encoded array bytes after an array-to-bytes or bytes-to-bytes codec. pub type RawBytes<'a> = Cow<'a, [u8]>; -/// Array element byte offsets. -pub type RawBytesOffsets<'a> = Cow<'a, [usize]>; - /// Fixed or variable length array bytes. -/// -/// Offsets are [`None`] if bytes are composed of fixed size data types. #[derive(Clone, Debug, PartialEq, Eq)] pub enum ArrayBytes<'a> { /// Bytes for a fixed length array. + /// + /// These represent elements in C-contiguous order (i.e. row-major order) where the last dimension varies the fastest. Fixed(RawBytes<'a>), /// Bytes and element byte offsets for a variable length array. + /// + /// The bytes and offsets are modeled on the [Apache Arrow Variable-size Binary Layout](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout). + /// - The offsets buffer contains `length + 1` ~~signed integers (either 32-bit or 64-bit, depending on the data type)~~ usize integers. + /// - Offsets must be monotonically increasing, that is `offsets[j+1] >= offsets[j]` for `0 <= j < length`, even for null slots. Thus, the bytes represent C-contiguous elements with padding permitted. + /// - The final offset must be less than or equal to the length of the bytes buffer. Variable(RawBytes<'a>, RawBytesOffsets<'a>), } /// An error raised if variable length array bytes offsets are out of bounds. 
+#[derive(Debug, Error, Display)] +#[display("Offset {offset} is out of bounds for bytes of length {len}")] +pub struct RawBytesOffsetsOutOfBoundsError { offset: usize, len: usize, } /// Errors related to [`ArrayBytes<'_>`] and [`ArrayBytes`]. #[derive(Debug, Error)] pub enum ArrayBytesError { @@ -38,16 +60,42 @@ impl<'a> ArrayBytes<'a> { /// Create a new fixed length array bytes from `bytes`. + /// + /// `bytes` must be C-contiguous. pub fn new_flen(bytes: impl Into<RawBytes<'a>>) -> Self { Self::Fixed(bytes.into()) } /// Create a new variable length array bytes from `bytes` and `offsets`. + /// + /// # Errors + /// Returns a [`RawBytesOffsetsOutOfBoundsError`] if the last offset is out of bounds of the bytes. pub fn new_vlen( bytes: impl Into<RawBytes<'a>>, - offsets: impl Into<RawBytesOffsets<'a>>, + offsets: RawBytesOffsets<'a>, + ) -> Result<Self, RawBytesOffsetsOutOfBoundsError> { + let bytes = bytes.into(); + if offsets.last() <= bytes.len() { + Ok(Self::Variable(bytes, offsets)) + } else { + Err(RawBytesOffsetsOutOfBoundsError { + offset: offsets.last(), + len: bytes.len(), + }) + } + } + + /// Create a new variable length array bytes from `bytes` and `offsets` without checking the offsets. + /// + /// # Safety + /// The last offset must be less than or equal to the length of the bytes. + pub unsafe fn new_vlen_unchecked( + bytes: impl Into<RawBytes<'a>>, + offsets: RawBytesOffsets<'a>, ) -> Self { - Self::Variable(bytes.into(), offsets.into()) + let bytes = bytes.into(); + debug_assert!(offsets.last() <= bytes.len()); + Self::Variable(bytes, offsets) } /// Create a new [`ArrayBytes`] with `num_elements` composed entirely of the `fill_value`. @@ -66,12 +114,18 @@ impl<'a> ArrayBytes<'a> { } ArraySize::Variable { num_elements } => { let num_elements = usize::try_from(num_elements).unwrap(); - Self::new_vlen( - fill_value.as_ne_bytes().repeat(num_elements), - (0..=num_elements) - .map(|i| i * fill_value.size()) - .collect::<Vec<_>>(), - ) + let offsets = unsafe { + // SAFETY: The offsets are monotonically increasing. + RawBytesOffsets::new_unchecked( + (0..=num_elements) + .map(|i| i * fill_value.size()) + .collect::<Vec<_>>(), + ) + }; + unsafe { + // SAFETY: The last offset is equal to the length of the bytes + Self::new_vlen_unchecked(fill_value.as_ne_bytes().repeat(num_elements), offsets) + } } } } @@ -121,9 +175,9 @@ impl<'a> ArrayBytes<'a> { #[must_use] pub fn into_owned<'b>(self) -> ArrayBytes<'b> { match self { - Self::Fixed(bytes) => ArrayBytes::<'b>::new_flen(bytes.into_owned()), + Self::Fixed(bytes) => ArrayBytes::<'b>::Fixed(bytes.into_owned().into()), Self::Variable(bytes, offsets) => { - ArrayBytes::<'b>::new_vlen(bytes.into_owned(), offsets.into_owned()) + ArrayBytes::<'b>::Variable(bytes.into_owned().into(), offsets.into_owned()) } } } @@ -188,7 +242,15 @@ impl<'a> ArrayBytes<'a> { ss_bytes.extend_from_slice(&bytes[curr..next]); } ss_offsets.push(ss_bytes.len()); - Ok(ArrayBytes::new_vlen(ss_bytes, ss_offsets)) + let ss_offsets = unsafe { + // SAFETY: The offsets are monotonically increasing. + RawBytesOffsets::new_unchecked(ss_offsets) + }; + let array_bytes = unsafe { + // SAFETY: The last offset is equal to the length of the bytes + ArrayBytes::new_vlen_unchecked(ss_bytes, ss_offsets) + }; + Ok(array_bytes) } ArrayBytes::Fixed(bytes) => { let byte_ranges = @@ -201,14 +263,11 @@ impl<'a> ArrayBytes<'a> { } /// Validate fixed length array bytes for a given array size. 
-fn validate_bytes_flen(bytes: &RawBytes, array_size: u64) -> Result<(), CodecError> { - if bytes.len() as u64 == array_size { +fn validate_bytes_flen(bytes: &RawBytes, array_size: usize) -> Result<(), InvalidBytesLengthError> { + if bytes.len() == array_size { Ok(()) } else { - Err(CodecError::UnexpectedChunkDecodedSize( - bytes.len(), - array_size, - )) + Err(InvalidBytesLengthError::new(bytes.len(), array_size)) } } @@ -243,9 +302,10 @@ fn validate_bytes( data_type_size: DataTypeSize, ) -> Result<(), CodecError> { match (bytes, data_type_size) { - (ArrayBytes::Fixed(bytes), DataTypeSize::Fixed(data_type_size)) => { - validate_bytes_flen(bytes, num_elements * data_type_size as u64) - } + (ArrayBytes::Fixed(bytes), DataTypeSize::Fixed(data_type_size)) => Ok(validate_bytes_flen( + bytes, + usize::try_from(num_elements * data_type_size as u64).unwrap(), + )?), (ArrayBytes::Variable(bytes, offsets), DataTypeSize::Variable) => { validate_bytes_vlen(bytes, offsets, num_elements) } @@ -258,157 +318,126 @@ } } -/// This function is used internally by various array/codec methods to write the bytes of a chunk subset into an output with an associated array subset. -/// This approach only works for fixed length data types. -pub(crate) fn update_bytes_flen( - output_bytes: &UnsafeCellSlice<u8>, - output_shape: &[u64], - subset_bytes: &RawBytes, - subset: &ArraySubset, - data_type_size: usize, -) { - debug_assert_eq!( - output_bytes.len(), - usize::try_from(output_shape.iter().product::<u64>()).unwrap() * data_type_size - ); - debug_assert_eq!( - subset_bytes.len(), - subset.num_elements_usize() * data_type_size, - ); - - let contiguous_indices = - unsafe { subset.contiguous_linearised_indices_unchecked(output_shape) }; - let length = contiguous_indices.contiguous_elements_usize() * data_type_size; - let mut decoded_offset = 0; - // TODO: Par iteration? 
- for array_subset_element_index in &contiguous_indices { - let output_offset = usize::try_from(array_subset_element_index).unwrap() * data_type_size; - debug_assert!((output_offset + length) <= output_bytes.len()); - debug_assert!((decoded_offset + length) <= subset_bytes.len()); - unsafe { - output_bytes - .index_mut(output_offset..output_offset + length) - .copy_from_slice(&subset_bytes[decoded_offset..decoded_offset + length]); - } - decoded_offset += length; +pub(crate) fn update_bytes_vlen<'a>( + input_bytes: &RawBytes, + input_offsets: &RawBytesOffsets, + input_shape: &[u64], + update_bytes: &RawBytes, + update_offsets: &RawBytesOffsets, + update_subset: &ArraySubset, +) -> Result<ArrayBytes<'a>, IncompatibleArraySubsetAndShapeError> { + if !update_subset.inbounds_shape(input_shape) { + return Err(IncompatibleArraySubsetAndShapeError::new( + update_subset.clone(), + input_shape.to_vec(), + )); } -} -pub(crate) fn update_bytes_vlen<'a>( - output_bytes: &RawBytes, - output_offsets: &RawBytesOffsets, - output_shape: &[u64], - subset_bytes: &RawBytes, - subset_offsets: &RawBytesOffsets, - subset: &ArraySubset, -) -> ArrayBytes<'a> { // Get the current and new length of the bytes in the chunk subset - let size_subset_new = { - let chunk_subset_indices = ArraySubset::new_with_shape(subset.shape().to_vec()) - .linearised_indices(subset.shape()) - .unwrap(); - chunk_subset_indices - .iter() - .map(|index| { - let index = usize::try_from(index).unwrap(); - subset_offsets[index + 1] - subset_offsets[index] - }) - .sum::<usize>() - }; + let size_subset_new = update_offsets + .iter() + .tuple_windows() + .map(|(curr, next)| next - curr) + .sum::<usize>(); let size_subset_old = { - let chunk_indices = subset.linearised_indices(output_shape).unwrap(); + let chunk_indices = update_subset.linearised_indices(input_shape).unwrap(); chunk_indices .iter() .map(|index| { let index = usize::try_from(index).unwrap(); - output_offsets[index + 1] - output_offsets[index] + input_offsets[index + 1] - input_offsets[index] }) .sum::<usize>() }; // Populate new offsets and bytes - let mut offsets_new = Vec::with_capacity(output_offsets.len()); - let bytes_new_len = (output_bytes.len() + size_subset_new) + let mut offsets_new = Vec::with_capacity(input_offsets.len()); + let bytes_new_len = (input_bytes.len() + size_subset_new) .checked_sub(size_subset_old) .unwrap(); let mut bytes_new = Vec::with_capacity(bytes_new_len); - let indices = ArraySubset::new_with_shape(output_shape.to_vec()).indices(); + let indices = ArraySubset::new_with_shape(input_shape.to_vec()).indices(); for (chunk_index, indices) in indices.iter().enumerate() { offsets_new.push(bytes_new.len()); - if subset.contains(&indices) { + if update_subset.contains(&indices) { let subset_indices = indices .iter() - .zip(subset.start()) + .zip(update_subset.start()) .map(|(i, s)| i - s) .collect::<Vec<_>>(); let subset_index = - usize::try_from(ravel_indices(&subset_indices, subset.shape())).unwrap(); - let start = subset_offsets[subset_index]; - let end = subset_offsets[subset_index + 1]; - bytes_new.extend_from_slice(&subset_bytes[start..end]); + usize::try_from(ravel_indices(&subset_indices, update_subset.shape())).unwrap(); + let start = update_offsets[subset_index]; + let end = update_offsets[subset_index + 1]; + bytes_new.extend_from_slice(&update_bytes[start..end]); } else { - let start = output_offsets[chunk_index]; - let end = 
input_offsets[chunk_index + 1]; bytes_new.extend_from_slice(&input_bytes[start..end]); + let start = input_offsets[chunk_index]; + let end = input_offsets[chunk_index + 1]; + bytes_new.extend_from_slice(&input_bytes[start..end]); } } offsets_new.push(bytes_new.len()); - - ArrayBytes::new_vlen(bytes_new, offsets_new) + let offsets_new = unsafe { + // SAFETY: The offsets are monotonically increasing. + RawBytesOffsets::new_unchecked(offsets_new) + }; + let array_bytes = unsafe { + // SAFETY: The last offset is equal to the length of the bytes + ArrayBytes::new_vlen_unchecked(bytes_new, offsets_new) + }; + Ok(array_bytes) } /// Update a subset of an array. /// /// This function is used internally by [`crate::array::Array::store_chunk_subset_opt`] and [`crate::array::Array::async_store_chunk_subset_opt`]. /// -/// # Safety -/// The caller must ensure that: -/// - `output_bytes` is an array with `output_shape` and `data_type_size`, -/// - `output_subset_bytes` is an array with the shape of `output_subset` and `data_type_size`, -/// - `output_subset` is within the bounds of `output_shape`, and -/// - `output_bytes` and `output_subset_bytes` are compatible (e.g. both fixed or both variable sized). -#[must_use] -pub unsafe fn update_array_bytes<'a>( +/// # Errors +/// Returns a [`CodecError`] if +/// - `output_bytes` are not compatible with the `output_shape` and `data_type_size`, +/// - `output_subset_bytes` are not compatible with the `output_subset` and `data_type_size`, +/// - `output_subset` is not within the bounds of `output_shape` +pub fn update_array_bytes<'a>( output_bytes: ArrayBytes, output_shape: &[u64], output_subset: &ArraySubset, output_subset_bytes: &ArrayBytes, data_type_size: DataTypeSize, -) -> ArrayBytes<'a> { +) -> Result<ArrayBytes<'a>, CodecError> { match (output_bytes, output_subset_bytes, data_type_size) { ( ArrayBytes::Variable(chunk_bytes, chunk_offsets), ArrayBytes::Variable(chunk_subset_bytes, chunk_subset_offsets), DataTypeSize::Variable, - ) => update_bytes_vlen( + ) => Ok(update_bytes_vlen( &chunk_bytes, &chunk_offsets, output_shape, chunk_subset_bytes, chunk_subset_offsets, output_subset, - ), + )?), ( ArrayBytes::Fixed(chunk_bytes), ArrayBytes::Fixed(chunk_subset_bytes), DataTypeSize::Fixed(data_type_size), ) => { let mut chunk_bytes = chunk_bytes.into_owned(); - { - let chunk_bytes = UnsafeCellSlice::new(&mut chunk_bytes); - update_bytes_flen( - &chunk_bytes, - output_shape, - chunk_subset_bytes, - output_subset, + let mut output_view = unsafe { + // SAFETY: Only one view is created, so it is disjoint + ArrayBytesFixedDisjointView::new( + UnsafeCellSlice::new(&mut chunk_bytes), data_type_size, - ); + output_shape, + output_subset.clone(), + ) } - ArrayBytes::new_flen(chunk_bytes) - } - (_, _, _) => { - unreachable!("Validation should occur outside of this function") + .map_err(CodecError::from)?; + output_view.copy_from_slice(chunk_subset_bytes)?; + Ok(ArrayBytes::new_flen(chunk_bytes)) } + (_, _, DataTypeSize::Variable) => Err(CodecError::ExpectedVariableLengthBytes), + (_, _, DataTypeSize::Fixed(_)) => Err(CodecError::ExpectedFixedLengthBytes), } } @@ -459,10 +488,14 @@ pub(crate) fn merge_chunks_vlen<'a>( *acc += sz; Some(*acc) })); + let offsets = unsafe { + // SAFETY: The offsets are monotonically increasing. 
+ RawBytesOffsets::new_unchecked(offsets) + }; // Write bytes // TODO: Go parallel - let mut bytes = vec![0; *offsets.last().unwrap()]; + let mut bytes = vec![0; offsets.last()]; for (chunk_bytes, chunk_subset) in chunk_bytes_and_subsets { let (chunk_bytes, chunk_offsets) = chunk_bytes.into_variable()?; let indices = chunk_subset.linearised_indices(array_shape).unwrap(); @@ -476,7 +509,12 @@ } } - Ok(ArrayBytes::new_vlen(bytes, offsets)) + let array_bytes = unsafe { + // SAFETY: The last offset is equal to the length of the bytes + ArrayBytes::new_vlen_unchecked(bytes, offsets) + }; + + Ok(array_bytes) } pub(crate) fn extract_decoded_regions_vlen<'a>( @@ -506,7 +544,15 @@ region_bytes.extend_from_slice(&bytes[curr..next]); } region_offsets.push(region_bytes.len()); - out.push(ArrayBytes::new_vlen(region_bytes, region_offsets)); + let region_offsets = unsafe { + // SAFETY: The offsets are monotonically increasing. + RawBytesOffsets::new_unchecked(region_offsets) + }; + let array_bytes = unsafe { + // SAFETY: The last offset is equal to the length of the bytes + ArrayBytes::new_vlen_unchecked(region_bytes, region_offsets) + }; + out.push(array_bytes); } Ok(out) } @@ -524,25 +570,15 @@ /// - `data_type` and `fill_value` are compatible, /// - `output` holds enough space for the preallocated bytes of an array with `output_shape` and `data_type`, and /// - `output_subset` is within the bounds of `output_shape`. -pub unsafe fn copy_fill_value_into( +pub fn copy_fill_value_into( data_type: &DataType, fill_value: &FillValue, - output: &UnsafeCellSlice<u8>, - output_shape: &[u64], - output_subset: &ArraySubset, + output_view: &mut ArrayBytesFixedDisjointView, ) -> Result<(), CodecError> { - let array_size = ArraySize::new(data_type.size(), output_subset.num_elements()); - if let (ArrayBytes::Fixed(fill_value_bytes), Some(data_type_size)) = ( - ArrayBytes::new_fill_value(array_size, fill_value), - data_type.fixed_size(), - ) { - update_bytes_flen( - output, - output_shape, - &fill_value_bytes, - output_subset, - data_type_size, - ); + let array_size = ArraySize::new(data_type.size(), output_view.num_elements()); + if let ArrayBytes::Fixed(fill_value_bytes) = ArrayBytes::new_fill_value(array_size, fill_value) + { + output_view.copy_from_slice(&fill_value_bytes)?; Ok(()) } else { // TODO: Variable length data type support? 
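The validated variable-length construction above is easiest to follow end-to-end. The following is a minimal sketch using only APIs introduced in this diff (`RawBytesOffsets::new`, `ArrayBytes::new_vlen`); the concrete byte and offset values are illustrative:

```rust
use zarrs::array::{ArrayBytes, RawBytesOffsets};

// Three variable-length elements ("a", "bb", "ccc") packed C-contiguously.
let bytes = b"abbccc".to_vec();

// Offsets are validated on construction: non-empty and monotonically increasing.
let offsets = RawBytesOffsets::new(vec![0, 1, 3, 6]).unwrap();

// `new_vlen` additionally checks that the last offset is within the bytes.
let _vlen = ArrayBytes::new_vlen(bytes, offsets).unwrap();

// An out-of-bounds final offset is now a recoverable error rather than latent UB.
let bad_offsets = RawBytesOffsets::new(vec![0, 4]).unwrap();
assert!(ArrayBytes::new_vlen(b"abc".to_vec(), bad_offsets).is_err());
```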
@@ -602,7 +638,6 @@ impl<'a, const N: usize> From<&'a [u8; N]> for ArrayBytes<'a> { #[cfg(test)] mod tests { use std::error::Error; - use std::mem::size_of; use crate::array::Element; @@ -620,6 +655,17 @@ Ok(()) } + #[test] + fn array_bytes_vlen() { + let data = [0u8, 1, 2, 3, 4]; + assert!(ArrayBytes::new_vlen(&data, vec![0].try_into().unwrap()).is_ok()); + assert!(ArrayBytes::new_vlen(&data, vec![0, 5].try_into().unwrap()).is_ok()); + assert!(ArrayBytes::new_vlen(&data, vec![0, 5, 5].try_into().unwrap()).is_ok()); + assert!(ArrayBytes::new_vlen(&data, vec![0, 5, 6].try_into().unwrap()).is_err()); + assert!(ArrayBytes::new_vlen(&data, vec![0, 1, 3, 5].try_into().unwrap()).is_ok()); + assert!(ArrayBytes::new_vlen(&data, vec![0, 1, 3, 6].try_into().unwrap()).is_err()); + } + #[test] fn array_bytes_str() -> Result<(), Box<dyn Error>> { let data = ["a", "bb", "ccc"]; @@ -638,21 +684,27 @@ let mut bytes_array = vec![0u8; 4 * 4]; { let bytes_array = UnsafeCellSlice::new(&mut bytes_array); - update_bytes_flen( - &bytes_array, - &vec![4, 4], - &vec![1u8, 2].into(), - &ArraySubset::new_with_ranges(&[1..2, 1..3]), - 1, - ); - - update_bytes_flen( - &bytes_array, - &vec![4, 4], - &vec![3u8, 4].into(), - &ArraySubset::new_with_ranges(&[3..4, 0..2]), - 1, - ); + let mut output_non_overlapping_0 = unsafe { + // SAFETY: Only one view is created, so it is disjoint + ArrayBytesFixedDisjointView::new_unchecked( + bytes_array, + size_of::<u8>(), + &[4, 4], + ArraySubset::new_with_ranges(&[1..2, 1..3]), + ) + }; + output_non_overlapping_0.copy_from_slice(&[1u8, 2]).unwrap(); + + let mut output_non_overlapping_1 = unsafe { + // SAFETY: Only one view is created, so it is disjoint + ArrayBytesFixedDisjointView::new_unchecked( + bytes_array, + size_of::<u8>(), + &[4, 4], + ArraySubset::new_with_ranges(&[3..4, 0..2]), + ) + }; + output_non_overlapping_1.copy_from_slice(&[3u8, 4]).unwrap(); } debug_assert_eq!( diff --git a/zarrs/src/array/array_bytes/raw_bytes_offsets.rs b/zarrs/src/array/array_bytes/raw_bytes_offsets.rs new file mode 100644 index 00000000..3de578f3 --- /dev/null +++ b/zarrs/src/array/array_bytes/raw_bytes_offsets.rs @@ -0,0 +1,128 @@ +use std::{borrow::Cow, ops::Deref}; + +use derive_more::derive::Display; +use thiserror::Error; + +/// Array element byte offsets. +/// +/// These must be monotonically increasing. See [`ArrayBytes::Variable`](crate::array::ArrayBytes::Variable). +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct RawBytesOffsets<'a>(Cow<'a, [usize]>); + +impl Deref for RawBytesOffsets<'_> { + type Target = [usize]; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +/// An error creating [`RawBytesOffsets`]. +#[derive(Debug, Error, Display)] +pub enum RawBytesOffsetsCreateError { + /// The offsets length must be greater than zero. + #[display("offsets length must be greater than zero")] + ZeroLength, + /// The offsets are not monotonically increasing. + #[display("offsets are not monotonically increasing")] + NotMonotonicallyIncreasing, +} + +impl<'a> RawBytesOffsets<'a> { + /// Creates a new `RawBytesOffsets`. + /// + /// # Errors + /// Returns an error if the offsets are empty or not monotonically increasing. 
+    pub fn new(offsets: impl Into<Cow<'a, [usize]>>) -> Result<Self, RawBytesOffsetsCreateError> {
+        let offsets = offsets.into();
+        if offsets.is_empty() {
+            Err(RawBytesOffsetsCreateError::ZeroLength)
+        } else if offsets.windows(2).all(|w| w[1] >= w[0]) {
+            Ok(Self(offsets))
+        } else {
+            Err(RawBytesOffsetsCreateError::NotMonotonicallyIncreasing)
+        }
+    }
+
+    /// Creates a new `RawBytesOffsets` without checking the offsets.
+    ///
+    /// # Safety
+    /// The offsets must be monotonically increasing.
+    #[must_use]
+    pub unsafe fn new_unchecked(offsets: impl Into<Cow<'a, [usize]>>) -> Self {
+        let offsets = offsets.into();
+        debug_assert!(!offsets.is_empty());
+        debug_assert!(offsets.windows(2).all(|w| w[1] >= w[0]));
+        Self(offsets)
+    }
+
+    /// Clones the offsets if not already owned.
+    #[must_use]
+    pub fn into_owned(self) -> RawBytesOffsets<'static> {
+        RawBytesOffsets(self.0.into_owned().into())
+    }
+
+    /// Returns the last offset.
+    #[must_use]
+    pub fn last(&self) -> usize {
+        unsafe {
+            // SAFETY: The offsets cannot be empty.
+            *self.0.last().unwrap_unchecked()
+        }
+    }
+}
+
+impl<'a> TryFrom<Cow<'a, [usize]>> for RawBytesOffsets<'a> {
+    type Error = RawBytesOffsetsCreateError;
+
+    fn try_from(value: Cow<'a, [usize]>) -> Result<Self, Self::Error> {
+        Self::new(value)
+    }
+}
+
+impl<'a> TryFrom<&'a [usize]> for RawBytesOffsets<'a> {
+    type Error = RawBytesOffsetsCreateError;
+
+    fn try_from(value: &'a [usize]) -> Result<Self, Self::Error> {
+        Self::new(value)
+    }
+}
+
+impl<'a, const N: usize> TryFrom<&'a [usize; N]> for RawBytesOffsets<'a> {
+    type Error = RawBytesOffsetsCreateError;
+
+    fn try_from(value: &'a [usize; N]) -> Result<Self, Self::Error> {
+        Self::new(value)
+    }
+}
+
+impl TryFrom<Vec<usize>> for RawBytesOffsets<'_> {
+    type Error = RawBytesOffsetsCreateError;
+
+    fn try_from(value: Vec<usize>) -> Result<Self, Self::Error> {
+        Self::new(value)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn raw_bytes_offsets() {
+        let offsets = RawBytesOffsets::new(vec![0, 1, 2, 3]).unwrap();
+        assert_eq!(&*offsets, &[0, 1, 2, 3]);
+        assert!(RawBytesOffsets::new(vec![]).is_err());
+        assert!(RawBytesOffsets::new(vec![0]).is_ok());
+        assert!(RawBytesOffsets::new(vec![10]).is_ok()); // nonsense, but not invalid
+        assert!(RawBytesOffsets::new(vec![0, 1, 1]).is_ok());
+        assert!(RawBytesOffsets::new(vec![0, 1, 0]).is_err());
+        assert!(RawBytesOffsets::try_from(vec![0, 1, 2]).is_ok());
+        assert!(RawBytesOffsets::try_from(vec![0, 1, 0]).is_err());
+        assert!(RawBytesOffsets::try_from([0, 1, 2].as_slice()).is_ok());
+        assert!(RawBytesOffsets::try_from([0, 1, 0].as_slice()).is_err());
+        assert!(RawBytesOffsets::try_from(&[0, 1, 2]).is_ok());
+        assert!(RawBytesOffsets::try_from(&[0, 1, 0]).is_err());
+        assert!(RawBytesOffsets::try_from(Cow::Owned(vec![0, 1, 0])).is_err());
+    }
+}
diff --git a/zarrs/src/array/array_bytes_fixed_disjoint_view.rs b/zarrs/src/array/array_bytes_fixed_disjoint_view.rs
new file mode 100644
index 00000000..7e5caf8c
--- /dev/null
+++ b/zarrs/src/array/array_bytes_fixed_disjoint_view.rs
@@ -0,0 +1,354 @@
+use derive_more::derive::Display;
+use thiserror::Error;
+use unsafe_cell_slice::UnsafeCellSlice;
+
+use crate::array_subset::{
+    iterators::{ContiguousIndices, ContiguousLinearisedIndices},
+    ArraySubset,
+};
+
+use super::codec::{CodecError, InvalidBytesLengthError, SubsetOutOfBoundsError};
+
+/// A disjoint view of the bytes in an array with a fixed-length data type.
+///
+/// The `subset` represented by this view must not overlap with the `subset` of any other created views that reference the same array bytes.
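+///
+/// # Example
+/// A sketch of two disjoint views over one buffer (illustrative only; the re-export paths are assumed, not part of this diff):
+/// ```ignore
+/// use unsafe_cell_slice::UnsafeCellSlice;
+/// use zarrs::array::ArrayBytesFixedDisjointView;
+/// use zarrs::array_subset::ArraySubset;
+///
+/// let shape: Vec<u64> = vec![4, 4];
+/// let mut bytes = vec![0u8; 4 * 4];
+/// let slice = UnsafeCellSlice::new(&mut bytes);
+/// // SAFETY: rows 0..2 and rows 2..4 do not overlap
+/// let mut top = unsafe {
+///     ArrayBytesFixedDisjointView::new(slice, 1, &shape, ArraySubset::new_with_ranges(&[0..2, 0..4]))
+/// }
+/// .unwrap();
+/// let mut bottom = unsafe {
+///     ArrayBytesFixedDisjointView::new(slice, 1, &shape, ArraySubset::new_with_ranges(&[2..4, 0..4]))
+/// }
+/// .unwrap();
+/// top.fill(&[1]).unwrap();
+/// bottom.fill(&[2]).unwrap();
+/// ```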
+pub struct ArrayBytesFixedDisjointView<'a> {
+    bytes: UnsafeCellSlice<'a, u8>,
+    data_type_size: usize,
+    shape: &'a [u64],
+    subset: ArraySubset,
+    bytes_in_subset_len: usize,
+}
+
+/// Errors that can occur when creating an [`ArrayBytesFixedDisjointView`].
+#[derive(Debug, Error, Display)]
+pub enum ArrayBytesFixedDisjointViewCreateError {
+    /// The subset is out-of-bounds of the array shape.
+    SubsetOutOfBounds(#[from] SubsetOutOfBoundsError),
+    /// The length of the bytes does not match the expected length.
+    InvalidBytesLength(#[from] InvalidBytesLengthError),
+}
+
+impl From<ArrayBytesFixedDisjointViewCreateError> for CodecError {
+    fn from(value: ArrayBytesFixedDisjointViewCreateError) -> Self {
+        match value {
+            ArrayBytesFixedDisjointViewCreateError::SubsetOutOfBounds(e) => e.into(),
+            ArrayBytesFixedDisjointViewCreateError::InvalidBytesLength(e) => e.into(),
+        }
+    }
+}
+
+impl<'a> ArrayBytesFixedDisjointView<'a> {
+    /// Create a new non-overlapping view of the bytes in an array.
+    ///
+    /// # Errors
+    /// Returns [`ArrayBytesFixedDisjointViewCreateError`] if
+    /// - `subset` is out-of-bounds of `shape`, or
+    /// - the length of `bytes` is not the product of the elements in `shape` multiplied by `data_type_size`.
+    ///
+    /// # Safety
+    /// The `subset` represented by this view must not overlap with the `subset` of any other created views that reference the same array bytes.
+    ///
+    /// # Panics
+    /// Panics if the product of the elements in `shape` multiplied by `data_type_size` exceeds [`usize::MAX`].
+    pub unsafe fn new(
+        bytes: UnsafeCellSlice<'a, u8>,
+        data_type_size: usize,
+        shape: &'a [u64],
+        subset: ArraySubset,
+    ) -> Result<Self, ArrayBytesFixedDisjointViewCreateError> {
+        if !subset.inbounds_shape(shape) {
+            let bounding_subset = ArraySubset::new_with_shape(shape.to_vec());
+            return Err(SubsetOutOfBoundsError::new(subset, bounding_subset).into());
+        }
+        let bytes_in_array_len =
+            usize::try_from(shape.iter().product::<u64>()).unwrap() * data_type_size;
+        if bytes.len() != bytes_in_array_len {
+            return Err(InvalidBytesLengthError::new(bytes.len(), bytes_in_array_len).into());
+        }
+
+        let bytes_in_subset_len = subset.num_elements_usize() * data_type_size;
+        Ok(Self {
+            bytes,
+            data_type_size,
+            shape,
+            subset,
+            bytes_in_subset_len,
+        })
+    }
+
+    /// Create a new non-overlapping view of the bytes in an array.
+    ///
+    /// # Safety
+    /// - `subset` must be inbounds of `shape`,
+    /// - the length of `bytes` must be the product of the elements in `shape` multiplied by `data_type_size`, and
+    /// - the `subset` represented by this view must not overlap with the `subset` of any other created views that reference the same array bytes.
+    ///
+    /// # Panics
+    /// Panics if the product of the elements in `shape` multiplied by `data_type_size` exceeds [`usize::MAX`].
+    #[must_use]
+    pub unsafe fn new_unchecked(
+        bytes: UnsafeCellSlice<'a, u8>,
+        data_type_size: usize,
+        shape: &'a [u64],
+        subset: ArraySubset,
+    ) -> Self {
+        debug_assert!(subset.inbounds_shape(shape));
+        debug_assert_eq!(
+            bytes.len(),
+            usize::try_from(shape.iter().product::<u64>()).unwrap() * data_type_size
+        );
+
+        let bytes_in_subset_len = subset.num_elements_usize() * data_type_size;
+        Self {
+            bytes,
+            data_type_size,
+            shape,
+            subset,
+            bytes_in_subset_len,
+        }
+    }
+
+    /// Create a new non-overlapping view of the bytes in an array that is a subset of the current view.
+    ///
+    /// # Errors
+    /// Returns [`SubsetOutOfBoundsError`] if `subset` is out-of-bounds of the parent subset.
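+    ///
+    /// # Example
+    /// A sketch (illustrative; `view` is an existing view whose `subset` spans `[0..2, 0..4]`):
+    /// ```ignore
+    /// // SAFETY: the caller must keep the child disjoint from any other live views.
+    /// let row0 = unsafe { view.subdivide(ArraySubset::new_with_ranges(&[0..1, 0..4])) }.unwrap();
+    /// ```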
+    ///
+    /// # Safety
+    /// The `subset` represented by this view must not overlap with the `subset` of any other created views that reference the same array bytes.
+    pub unsafe fn subdivide(
+        &self,
+        subset: ArraySubset,
+    ) -> Result<ArrayBytesFixedDisjointView<'a>, SubsetOutOfBoundsError> {
+        if !subset.inbounds(&self.subset) {
+            return Err(SubsetOutOfBoundsError::new(subset, self.subset.clone()));
+        }
+
+        Ok(unsafe {
+            // SAFETY: all inputs have been validated
+            Self::new_unchecked(self.bytes, self.data_type_size, self.shape, subset)
+        })
+    }
+
+    /// Create a new non-overlapping view of the bytes in an array that is a subset of the current view.
+    ///
+    /// # Safety
+    /// - `subset` must be inbounds of the parent subset, and
+    /// - the `subset` represented by this view must not overlap with the `subset` of any other created views that reference the same array bytes.
+    #[must_use]
+    pub unsafe fn subdivide_unchecked(
+        &self,
+        subset: ArraySubset,
+    ) -> ArrayBytesFixedDisjointView<'a> {
+        debug_assert!(subset.inbounds(&self.subset));
+
+        unsafe { Self::new_unchecked(self.bytes, self.data_type_size, self.shape, subset) }
+    }
+
+    /// Return the shape of the bytes this view is created from.
+    #[must_use]
+    pub fn shape(&self) -> &[u64] {
+        self.shape
+    }
+
+    /// Return the subset of the bytes this view is created from.
+    #[must_use]
+    pub fn subset(&self) -> &ArraySubset {
+        &self.subset
+    }
+
+    /// Return the number of elements in the view.
+    #[must_use]
+    pub fn num_elements(&self) -> u64 {
+        self.subset.num_elements()
+    }
+
+    fn contiguous_indices(&self) -> ContiguousIndices {
+        unsafe {
+            // SAFETY: the output shape encapsulates the output subset, checked in constructor
+            self.subset.contiguous_indices_unchecked(self.shape)
+        }
+    }
+
+    fn contiguous_linearised_indices(&self) -> ContiguousLinearisedIndices {
+        unsafe {
+            // SAFETY: the output shape encapsulates the output subset, checked in constructor
+            self.subset
+                .contiguous_linearised_indices_unchecked(self.shape)
+        }
+    }
+
+    /// Return the contiguous element length of the view.
+    ///
+    /// This is the number of elements that are accessed in a single contiguous block.
+    #[must_use]
+    pub fn num_contiguous_elements(&self) -> usize {
+        self.contiguous_indices().contiguous_elements_usize()
+    }
+
+    /// Return the size in bytes of contiguous elements in the view.
+    ///
+    /// This is the byte length of the elements that are accessed in a single contiguous block.
+    #[must_use]
+    pub fn contiguous_bytes_len(&self) -> usize {
+        self.contiguous_indices().contiguous_elements_usize() * self.data_type_size
+    }
+
+    /// Fill the view with the fill value.
+    ///
+    /// # Errors
+    /// Returns [`InvalidBytesLengthError`] if the length of the `fill_value` does not match the data type size.
+    ///
+    /// # Panics
+    /// Panics if an offset into the internal bytes reference exceeds [`usize::MAX`].
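+    ///
+    /// # Example
+    /// A sketch (illustrative; `view` is an existing view over 2-byte elements):
+    /// ```ignore
+    /// view.fill(&42u16.to_ne_bytes()).unwrap();
+    /// assert!(view.fill(&[0u8]).is_err()); // wrong length for a 2-byte data type
+    /// ```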
+    pub fn fill(&mut self, fill_value: &[u8]) -> Result<(), InvalidBytesLengthError> {
+        if fill_value.len() != self.data_type_size {
+            return Err(InvalidBytesLengthError::new(
+                fill_value.len(),
+                self.data_type_size,
+            ));
+        }
+
+        let fill_value_contiguous = fill_value.repeat(self.num_contiguous_elements());
+        let length = self.contiguous_bytes_len();
+        debug_assert_eq!(fill_value_contiguous.len(), length);
+        let contiguous_indices = self.contiguous_linearised_indices();
+        contiguous_indices.into_iter().for_each(|index| {
+            let offset = usize::try_from(index * self.data_type_size as u64).unwrap();
+            unsafe {
+                self.bytes
+                    .index_mut(offset..offset + length)
+                    .copy_from_slice(&fill_value_contiguous);
+            }
+        });
+        Ok(())
+    }
+
+    /// Copy bytes into the view.
+    ///
+    /// The `subset_bytes` must be the same length as the byte length of the elements in the view.
+    ///
+    /// # Errors
+    /// Returns an [`InvalidBytesLengthError`] if the length of `subset_bytes` is not the same as the byte length of the elements in the view.
+    ///
+    /// # Panics
+    /// Panics if an offset into the internal bytes reference exceeds [`usize::MAX`].
+    pub fn copy_from_slice(&mut self, subset_bytes: &[u8]) -> Result<(), InvalidBytesLengthError> {
+        if subset_bytes.len() != self.bytes_in_subset_len {
+            return Err(InvalidBytesLengthError::new(
+                subset_bytes.len(),
+                self.bytes_in_subset_len,
+            ));
+        }
+
+        let contiguous_indices = self.contiguous_linearised_indices();
+        let length = contiguous_indices.contiguous_elements_usize() * self.data_type_size;
+
+        let bytes_copied = contiguous_indices.into_iter().fold(
+            0,
+            |subset_offset: usize, array_subset_element_index: u64| {
+                let output_offset =
+                    usize::try_from(array_subset_element_index).unwrap() * self.data_type_size;
+                debug_assert!((output_offset + length) <= self.bytes.len());
+                debug_assert!((subset_offset + length) <= subset_bytes.len());
+                let subset_offset_end = subset_offset + length;
+                unsafe {
+                    self.bytes
+                        .index_mut(output_offset..output_offset + length)
+                        .copy_from_slice(&subset_bytes[subset_offset..subset_offset_end]);
+                }
+                subset_offset_end
+            },
+        );
+        debug_assert_eq!(bytes_copied, subset_bytes.len());
+
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn disjoint_view() {
+        let mut bytes = (0..9).collect::<Vec<u8>>();
+        let shape = vec![3, 3];
+        {
+            let bytes = UnsafeCellSlice::new(&mut bytes);
+
+            assert!(unsafe {
+                ArrayBytesFixedDisjointView::new(
+                    bytes,
+                    1,
+                    &[10, 10],
+                    ArraySubset::new_with_ranges(&[0..2, 1..3]),
+                )
+            }
+            .is_err()); // incompatible shape
+            assert!(unsafe {
+                ArrayBytesFixedDisjointView::new(
+                    bytes,
+                    2,
+                    &shape,
+                    ArraySubset::new_with_ranges(&[0..2, 1..3]),
+                )
+            }
+            .is_err()); // invalid bytes length
+            assert!(unsafe {
+                ArrayBytesFixedDisjointView::new(
+                    bytes,
+                    1,
+                    &shape,
+                    ArraySubset::new_with_ranges(&[0..2, 1..10]),
+                )
+            }
+            .is_err()); // OOB
+
+            let mut view0 = unsafe {
+                ArrayBytesFixedDisjointView::new(
+                    bytes,
+                    1,
+                    &shape,
+                    ArraySubset::new_with_ranges(&[0..2, 1..3]),
+                )
+            }
+            .unwrap();
+            assert_eq!(view0.shape(), shape);
+
+            view0.copy_from_slice(&[11, 12, 14, 15]).unwrap();
+            assert!(view0.copy_from_slice(&[11, 12, 14, 15, 255]).is_err()); // wrong length
+
+            let mut view0a =
+                unsafe { view0.subdivide(ArraySubset::new_with_ranges(&[1..2, 1..3])) }.unwrap();
+            view0a.copy_from_slice(&[24, 25]).unwrap();
+            assert!(view0a.copy_from_slice(&[]).is_err()); // wrong length
+
+            assert!(
+                unsafe { view0a.subdivide(ArraySubset::new_with_ranges(&[1..2, 1..3])) }.is_ok()
+            );
+
assert!( + unsafe { view0a.subdivide(ArraySubset::new_with_ranges(&[1..2, 2..3])) }.is_ok() + ); + assert!( + unsafe { view0a.subdivide(ArraySubset::new_with_ranges(&[0..2, 1..3])) }.is_err() + ); // OOB + assert!( + unsafe { view0a.subdivide(ArraySubset::new_with_ranges(&[1..2, 1..4])) }.is_err() + ); // OOB + + let mut view1 = unsafe { + ArrayBytesFixedDisjointView::new( + bytes, + 1, + &shape, + ArraySubset::new_with_ranges(&[2..3, 1..3]), + ) + } + .unwrap(); + view1.fill(&[255]).unwrap(); + assert!(view1.fill(&[255, 255]).is_err()); // invalid fill value + } + assert_eq!(&bytes, &[0, 11, 12, 3, 24, 25, 6, 255, 255]); + } +} diff --git a/zarrs/src/array/array_errors.rs b/zarrs/src/array/array_errors.rs index 527d0a58..272aff53 100644 --- a/zarrs/src/array/array_errors.rs +++ b/zarrs/src/array/array_errors.rs @@ -2,19 +2,16 @@ use thiserror::Error; use crate::{ array_subset::{ArraySubset, IncompatibleDimensionalityError}, + data_type::{ + IncompatibleFillValueError, IncompatibleFillValueMetadataError, UnsupportedDataTypeError, + }, metadata::v3::UnsupportedAdditionalFieldError, node::NodePathError, plugin::PluginCreateError, storage::StorageError, }; -use super::{ - codec::CodecError, - data_type::{ - IncompatibleFillValueError, IncompatibleFillValueMetadataError, UnsupportedDataTypeError, - }, - ArrayIndices, ArrayShape, -}; +use super::{codec::CodecError, ArrayIndices, ArrayShape}; /// An array creation error. #[derive(Debug, Error)] diff --git a/zarrs/src/array/array_metadata_options.rs b/zarrs/src/array/array_metadata_options.rs index 2ec189a0..e0dafc36 100644 --- a/zarrs/src/array/array_metadata_options.rs +++ b/zarrs/src/array/array_metadata_options.rs @@ -1,9 +1,11 @@ use crate::config::{global_config, MetadataConvertVersion}; +use super::codec::CodecMetadataOptions; + /// Options for writing array metadata. #[derive(Debug, Clone)] pub struct ArrayMetadataOptions { - experimental_codec_store_metadata_if_encode_only: bool, + codec_options: CodecMetadataOptions, convert_version: MetadataConvertVersion, include_zarrs_metadata: bool, } @@ -11,7 +13,7 @@ pub struct ArrayMetadataOptions { impl Default for ArrayMetadataOptions { fn default() -> Self { Self { - experimental_codec_store_metadata_if_encode_only: false, + codec_options: CodecMetadataOptions::default(), convert_version: global_config().metadata_convert_version(), include_zarrs_metadata: global_config().include_zarrs_metadata(), } @@ -19,26 +21,16 @@ impl Default for ArrayMetadataOptions { } impl ArrayMetadataOptions { - /// Return the [experimental codec store metadata if encode only](crate::config::Config#experimental-codec-store-metadata-if-encode-only) setting. + /// Return the codec options. #[must_use] - pub fn experimental_codec_store_metadata_if_encode_only(&self) -> bool { - self.experimental_codec_store_metadata_if_encode_only + pub fn codec_options(&self) -> &CodecMetadataOptions { + &self.codec_options } - /// Set the [experimental codec store metadata if encode only](crate::config::Config#experimental-codec-store-metadata-if-encode-only) setting. + /// Return a mutable reference to the codec options. #[must_use] - pub fn with_experimental_codec_store_metadata_if_encode_only(mut self, enabled: bool) -> Self { - self.experimental_codec_store_metadata_if_encode_only = enabled; - self - } - - /// Set the [experimental codec store metadata if encode only](crate::config::Config#experimental-codec-store-metadata-if-encode-only) setting. 
- pub fn set_experimental_codec_store_metadata_if_encode_only( - &mut self, - enabled: bool, - ) -> &mut Self { - self.experimental_codec_store_metadata_if_encode_only = enabled; - self + pub fn codec_options_mut(&mut self) -> &mut CodecMetadataOptions { + &mut self.codec_options } /// Get the [metadata convert version](crate::config::Config#metadata-convert-version) configuration. diff --git a/zarrs/src/array/array_representation.rs b/zarrs/src/array/array_representation.rs index 2c781fc5..5f186d50 100644 --- a/zarrs/src/array/array_representation.rs +++ b/zarrs/src/array/array_representation.rs @@ -1,6 +1,7 @@ use std::num::NonZeroU64; -use super::{data_type::IncompatibleFillValueError, ArrayShape, DataType, DataTypeSize, FillValue}; +use super::{ArrayShape, DataType, DataTypeSize, FillValue}; +use crate::data_type::IncompatibleFillValueError; use derive_more::Display; /// The shape, data type, and fill value of an `array`. diff --git a/zarrs/src/array/array_sync_readable.rs b/zarrs/src/array/array_sync_readable.rs index 200e0ba6..d699e944 100644 --- a/zarrs/src/array/array_sync_readable.rs +++ b/zarrs/src/array/array_sync_readable.rs @@ -20,7 +20,8 @@ use super::{ }, concurrency::concurrency_chunks_and_codec, element::ElementOwned, - Array, ArrayCreateError, ArrayError, ArrayMetadata, ArrayMetadataV3, ArraySize, DataTypeSize, + Array, ArrayBytesFixedDisjointView, ArrayCreateError, ArrayError, ArrayMetadata, + ArrayMetadataV3, ArraySize, DataTypeSize, }; #[cfg(feature = "ndarray")] @@ -458,12 +459,10 @@ impl Array { } } - unsafe fn retrieve_chunk_into( + fn retrieve_chunk_into( &self, chunk_indices: &[u64], - output: &UnsafeCellSlice, - output_shape: &[u64], - output_subset: &ArraySubset, + output_view: &mut ArrayBytesFixedDisjointView<'_>, options: &CodecOptions, ) -> Result<(), ArrayError> { if chunk_indices.len() != self.dimensionality() { @@ -481,28 +480,17 @@ impl Array { if let Some(chunk_encoded) = chunk_encoded { let chunk_encoded: Vec = chunk_encoded.into(); let chunk_representation = self.chunk_array_representation(chunk_indices)?; - unsafe { - self.codecs().decode_into( + self.codecs() + .decode_into( Cow::Owned(chunk_encoded), &chunk_representation, - output, - output_shape, - output_subset, + output_view, options, ) - } - .map_err(ArrayError::CodecError) + .map_err(ArrayError::CodecError) } else { - unsafe { - copy_fill_value_into( - self.data_type(), - self.fill_value(), - output, - output_shape, - output_subset, - ) - } - .map_err(ArrayError::CodecError) + copy_fill_value_into(self.data_type(), self.fill_value(), output_view) + .map_err(ArrayError::CodecError) } } @@ -720,16 +708,21 @@ impl Array { let retrieve_chunk = |chunk_indices: Vec| { let chunk_subset = self.chunk_subset(&chunk_indices)?; let chunk_subset_overlap = chunk_subset.overlap(array_subset)?; - unsafe { - self.retrieve_chunk_subset_into( - &chunk_indices, - &chunk_subset_overlap.relative_to(chunk_subset.start())?, - &output, + let mut output_view = unsafe { + // SAFETY: chunks represent disjoint array subsets + ArrayBytesFixedDisjointView::new_unchecked( + output, + data_type_size, array_subset.shape(), - &chunk_subset_overlap.relative_to(array_subset.start())?, - &options, - )?; - } + chunk_subset_overlap.relative_to(array_subset.start())?, + ) + }; + self.retrieve_chunk_subset_into( + &chunk_indices, + &chunk_subset_overlap.relative_to(chunk_subset.start())?, + &mut output_view, + &options, + )?; // let chunk_subset_bytes = self.retrieve_chunk_subset_opt( // &chunk_indices, // 
&chunk_subset_overlap.relative_to(chunk_subset.start())?, @@ -794,7 +787,7 @@ impl Array { options: &CodecOptions, ) -> Result, ArrayError> { let chunk_representation = self.chunk_array_representation(chunk_indices)?; - if !chunk_subset.inbounds(&chunk_representation.shape_u64()) { + if !chunk_subset.inbounds_shape(&chunk_representation.shape_u64()) { return Err(ArrayError::InvalidArraySubset( chunk_subset.clone(), self.shape().to_vec(), @@ -827,17 +820,15 @@ impl Array { Ok(bytes) } - unsafe fn retrieve_chunk_subset_into( + fn retrieve_chunk_subset_into( &self, chunk_indices: &[u64], chunk_subset: &ArraySubset, - output: &UnsafeCellSlice, - output_shape: &[u64], - output_subset: &ArraySubset, + output_view: &mut ArrayBytesFixedDisjointView<'_>, options: &CodecOptions, ) -> Result<(), ArrayError> { let chunk_representation = self.chunk_array_representation(chunk_indices)?; - if !chunk_subset.inbounds(&chunk_representation.shape_u64()) { + if !chunk_subset.inbounds_shape(&chunk_representation.shape_u64()) { return Err(ArrayError::InvalidArraySubset( chunk_subset.clone(), self.shape().to_vec(), @@ -848,15 +839,7 @@ impl Array { && chunk_subset.shape() == chunk_representation.shape_u64() { // Fast path if `chunk_subset` encompasses the whole chunk - unsafe { - self.retrieve_chunk_into( - chunk_indices, - output, - output_shape, - output_subset, - options, - ) - } + self.retrieve_chunk_into(chunk_indices, output_view, options) } else { let storage_handle = Arc::new(StorageHandle::new(self.storage.clone())); let storage_transformer = self @@ -866,19 +849,10 @@ impl Array { storage_transformer, self.chunk_key(chunk_indices), )); - - unsafe { - self.codecs - .clone() - .partial_decoder(input_handle, &chunk_representation, options)? - .partial_decode_into( - chunk_subset, - output, - output_shape, - output_subset, - options, - )?; - } + self.codecs + .clone() + .partial_decoder(input_handle, &chunk_representation, options)? 
+ .partial_decode_into(chunk_subset, output_view, options)?; Ok(()) } } diff --git a/zarrs/src/array/array_sync_readable_writable.rs b/zarrs/src/array/array_sync_readable_writable.rs index 520cb933..478f93a2 100644 --- a/zarrs/src/array/array_sync_readable_writable.rs +++ b/zarrs/src/array/array_sync_readable_writable.rs @@ -200,15 +200,13 @@ impl Array chunk_bytes_old.validate(chunk_shape.iter().product(), self.data_type().size())?; // Update the chunk - let chunk_bytes_new = unsafe { - update_array_bytes( - chunk_bytes_old, - &chunk_shape, - chunk_subset, - &chunk_subset_bytes, - self.data_type().size(), - ) - }; + let chunk_bytes_new = update_array_bytes( + chunk_bytes_old, + &chunk_shape, + chunk_subset, + &chunk_subset_bytes, + self.data_type().size(), + )?; // Store the updated chunk self.store_chunk_opt(chunk_indices, chunk_bytes_new, options) diff --git a/zarrs/src/array/array_sync_sharded_readable_ext.rs b/zarrs/src/array/array_sync_sharded_readable_ext.rs index 461df909..04ef4d4e 100644 --- a/zarrs/src/array/array_sync_sharded_readable_ext.rs +++ b/zarrs/src/array/array_sync_sharded_readable_ext.rs @@ -7,7 +7,7 @@ use zarrs_metadata::v3::array::codec::sharding::ShardingCodecConfiguration; use zarrs_storage::byte_range::ByteRange; use zarrs_storage::StorageHandle; -use super::array_bytes::{merge_chunks_vlen, update_bytes_flen}; +use super::array_bytes::merge_chunks_vlen; use super::codec::array_to_bytes::sharding::ShardingPartialDecoder; use super::codec::{CodecError, ShardingCodec}; use super::element::ElementOwned; @@ -15,7 +15,7 @@ use super::{ codec::CodecOptions, concurrency::concurrency_chunks_and_codec, Array, ArrayError, ArrayShardedExt, ChunkGrid, }; -use super::{ArrayBytes, ArraySize, DataTypeSize}; +use super::{ArrayBytes, ArrayBytesFixedDisjointView, ArraySize, DataTypeSize}; use crate::array::codec::StoragePartialDecoder; use crate::storage::ReadableStorageTraits; use crate::{array::codec::ArrayPartialDecoderTraits, array_subset::ArraySubset}; @@ -612,13 +612,18 @@ impl ArrayShardedReadableExt )? .remove(0) .into_owned(); - update_bytes_flen( - &output, - array_subset.shape(), - &bytes.into_fixed()?, - &shard_subset_overlap.relative_to(array_subset.start())?, - data_type_size, - ); + let mut output_view = unsafe { + // SAFETY: chunks represent disjoint array subsets + ArrayBytesFixedDisjointView::new_unchecked( + output, + data_type_size, + array_subset.shape(), + shard_subset_overlap.relative_to(array_subset.start())?, + ) + }; + output_view + .copy_from_slice(&bytes.into_fixed()?) 
+ .map_err(CodecError::from)?; Ok::<_, ArrayError>(()) }; let indices = shards.indices(); diff --git a/zarrs/src/array/chunk_cache/array_chunk_cache_ext_sync.rs b/zarrs/src/array/chunk_cache/array_chunk_cache_ext_sync.rs index db3f3fee..71e4a8fa 100644 --- a/zarrs/src/array/chunk_cache/array_chunk_cache_ext_sync.rs +++ b/zarrs/src/array/chunk_cache/array_chunk_cache_ext_sync.rs @@ -6,10 +6,11 @@ use unsafe_cell_slice::UnsafeCellSlice; use crate::{ array::{ - array_bytes::{merge_chunks_vlen, update_bytes_flen}, - codec::CodecOptions, + array_bytes::merge_chunks_vlen, + codec::{CodecError, CodecOptions}, concurrency::concurrency_chunks_and_codec, - Array, ArrayBytes, ArrayError, ArraySize, DataTypeSize, ElementOwned, + Array, ArrayBytes, ArrayBytesFixedDisjointView, ArrayError, ArraySize, DataTypeSize, + ElementOwned, }, array_subset::ArraySubset, storage::ReadableStorageTraits, @@ -229,7 +230,7 @@ impl ArrayChunkCacheExt Result, ArrayError> { let chunk_representation = self.chunk_array_representation(chunk_indices)?; - if !chunk_subset.inbounds(&chunk_representation.shape_u64()) { + if !chunk_subset.inbounds_shape(&chunk_representation.shape_u64()) { return Err(ArrayError::InvalidArraySubset( chunk_subset.clone(), self.shape().to_vec(), @@ -408,13 +409,18 @@ impl ArrayChunkCacheExt unreachable!(), }; - update_bytes_flen( - &output, - array_subset.shape(), - fixed, - &chunk_subset_overlap.relative_to(array_subset.start())?, - data_type_size, - ); + let mut output_view = unsafe { + // SAFETY: chunks represent disjoint array subsets + ArrayBytesFixedDisjointView::new_unchecked( + output, + data_type_size, + array_subset.shape(), + chunk_subset_overlap.relative_to(array_subset.start())?, + ) + }; + output_view + .copy_from_slice(fixed) + .map_err(CodecError::from)?; Ok::<_, ArrayError>(()) }; iter_concurrent_limit!( diff --git a/zarrs/src/array/chunk_cache/chunk_cache_lru.rs b/zarrs/src/array/chunk_cache/chunk_cache_lru.rs index 73a2f332..fae78303 100644 --- a/zarrs/src/array/chunk_cache/chunk_cache_lru.rs +++ b/zarrs/src/array/chunk_cache/chunk_cache_lru.rs @@ -359,7 +359,7 @@ impl ChunkCache for ChunkCacheDecodedLruSizeLimitThreadLo mod tests { use super::*; - use std::{mem::size_of, sync::Arc}; + use std::sync::Arc; use crate::{ array::{ diff --git a/zarrs/src/array/chunk_grid.rs b/zarrs/src/array/chunk_grid.rs index 79ff9ee7..9f5c3ea8 100644 --- a/zarrs/src/array/chunk_grid.rs +++ b/zarrs/src/array/chunk_grid.rs @@ -127,6 +127,9 @@ impl TryFrom for ChunkGrid { } /// Chunk grid traits. +// TODO: Unsafe trait? ChunkGridTraits has invariants that must be upheld by implementations. +// - chunks must be disjoint for downstream `ArrayBytesFixedDisjoint` construction and otherwise sane behavior +// - this is true for regular and rectangular grids, but a custom grid could violate this pub trait ChunkGridTraits: core::fmt::Debug + Send + Sync { /// Create metadata. fn create_metadata(&self) -> MetadataV3; diff --git a/zarrs/src/array/chunk_grid/rectangular.rs b/zarrs/src/array/chunk_grid/rectangular.rs index 05733fb5..6dcedd59 100644 --- a/zarrs/src/array/chunk_grid/rectangular.rs +++ b/zarrs/src/array/chunk_grid/rectangular.rs @@ -1,5 +1,7 @@ //! The `rectangular` chunk grid. //! +//! This chunk grid is considered experimental as it is based on a draft Zarr enhancement proposal. +//! //! See . 
use std::num::NonZeroU64; diff --git a/zarrs/src/array/codec.rs b/zarrs/src/array/codec.rs index 234dfdc5..864d88a8 100644 --- a/zarrs/src/array/codec.rs +++ b/zarrs/src/array/codec.rs @@ -13,8 +13,11 @@ pub mod array_to_array; pub mod array_to_bytes; pub mod bytes_to_bytes; +pub mod metadata_options; pub mod options; +use derive_more::derive::Display; +pub use metadata_options::CodecMetadataOptions; pub use options::{CodecOptions, CodecOptionsBuilder}; // Array to array @@ -67,7 +70,6 @@ pub use byte_interval_partial_decoder::ByteIntervalPartialDecoder; #[cfg(feature = "async")] pub use byte_interval_partial_decoder::AsyncByteIntervalPartialDecoder; -use unsafe_cell_slice::UnsafeCellSlice; mod array_partial_encoder_default; pub use array_partial_encoder_default::ArrayPartialEncoderDefault; @@ -77,6 +79,7 @@ pub use array_to_array_partial_encoder_default::ArrayToArrayPartialEncoderDefaul mod bytes_partial_encoder_default; pub use bytes_partial_encoder_default::BytesPartialEncoderDefault; +use zarrs_metadata::ArrayShape; use crate::indexer::IncompatibleIndexerAndShapeError; use crate::storage::{StoreKeyOffsetValue, WritableStorage}; @@ -95,12 +98,12 @@ use std::any::Any; use std::borrow::Cow; use std::sync::Arc; -use super::array_bytes::update_bytes_flen; +use super::RawBytesOffsetsOutOfBoundsError; use super::{ - concurrency::RecommendedConcurrency, ArrayMetadataOptions, BytesRepresentation, - ChunkRepresentation, ChunkShape, DataType, + array_bytes::RawBytesOffsetsCreateError, concurrency::RecommendedConcurrency, ArrayBytes, + ArrayBytesFixedDisjointView, BytesRepresentation, ChunkRepresentation, ChunkShape, DataType, + RawBytes, }; -use super::{ArrayBytes, RawBytes}; /// A codec plugin. pub type CodecPlugin = Plugin; @@ -200,13 +203,13 @@ pub trait CodecTraits: Send + Sync { /// Create metadata. /// /// A hidden codec (e.g. a cache) will return [`None`], since it will not have any associated metadata. - fn create_metadata_opt(&self, options: &ArrayMetadataOptions) -> Option; + fn create_metadata_opt(&self, options: &CodecMetadataOptions) -> Option; /// Create metadata with default options. /// /// A hidden codec (e.g. a cache) will return [`None`], since it will not have any associated metadata. fn create_metadata(&self) -> Option { - self.create_metadata_opt(&ArrayMetadataOptions::default()) + self.create_metadata_opt(&CodecMetadataOptions::default()) } /// Indicates if the input to a codecs partial decoder should be cached for optimal performance. @@ -360,34 +363,26 @@ pub trait ArrayPartialDecoderTraits: Any + Send + Sync { /// Extracted elements from the `array_subset` are written to the subset of the output in C order. /// /// # Errors - /// Returns [`CodecError`] if a codec fails or an array subset is invalid. - /// - /// # Safety - /// The caller must ensure that: - /// - `output` holds enough space for the preallocated bytes of an array with shape `output_shape` of the appropriate data type, - /// - `output_subset` is within the bounds of `output_shape`, and - /// - `output_subset` has the same number of elements as `array_subset`. 
-    unsafe fn partial_decode_into(
+    /// Returns [`CodecError`] if a codec fails or the number of elements in `array_subset` does not match the number of elements in `output_view`.
+    fn partial_decode_into(
         &self,
         array_subset: &ArraySubset,
-        output: &UnsafeCellSlice<u8>,
-        output_shape: &[u64],
-        output_subset: &ArraySubset,
+        output_view: &mut ArrayBytesFixedDisjointView<'_>,
         options: &CodecOptions,
     ) -> Result<(), CodecError> {
-        debug_assert!(output_subset.inbounds(output_shape));
-        debug_assert_eq!(array_subset.num_elements(), output_subset.num_elements());
+        if array_subset.num_elements() != output_view.num_elements() {
+            return Err(InvalidNumberOfElementsError::new(
+                output_view.num_elements(),
+                array_subset.num_elements(),
+            )
+            .into());
+        }
+
         let decoded_value = self
             .partial_decode(&[array_subset.clone()], options)?
             .remove(0);
         if let ArrayBytes::Fixed(decoded_value) = decoded_value {
-            update_bytes_flen(
-                output,
-                output_shape,
-                &decoded_value,
-                output_subset,
-                self.data_type().fixed_size().unwrap(),
-            );
+            output_view.copy_from_slice(&decoded_value)?;
             Ok(())
         } else {
             Err(CodecError::ExpectedFixedLengthBytes)
@@ -452,28 +447,25 @@ pub trait AsyncArrayPartialDecoderTraits: Any + Send + Sync {
 
     /// Async variant of [`ArrayPartialDecoderTraits::partial_decode_into`].
     #[allow(clippy::missing_safety_doc)]
-    async unsafe fn partial_decode_into(
+    async fn partial_decode_into(
         &self,
         array_subset: &ArraySubset,
-        output: &UnsafeCellSlice<u8>,
-        output_shape: &[u64],
-        output_subset: &ArraySubset,
+        output_view: &mut ArrayBytesFixedDisjointView<'_>,
         options: &CodecOptions,
     ) -> Result<(), CodecError> {
-        debug_assert!(output_subset.inbounds(output_shape));
-        debug_assert_eq!(array_subset.shape(), output_subset.shape());
+        if array_subset.num_elements() != output_view.num_elements() {
+            return Err(InvalidNumberOfElementsError::new(
+                output_view.num_elements(),
+                array_subset.num_elements(),
+            )
+            .into());
+        }
         let decoded_value = self
             .partial_decode(&[array_subset.clone()], options)
             .await?
             .remove(0);
         if let ArrayBytes::Fixed(decoded_value) = decoded_value {
-            update_bytes_flen(
-                output,
-                output_shape,
-                &decoded_value,
-                output_subset,
-                self.data_type().fixed_size().unwrap(),
-            );
+            output_view.copy_from_slice(&decoded_value)?;
             Ok(())
         } else {
             Err(CodecError::ExpectedFixedLengthBytes)
@@ -711,36 +703,24 @@ pub trait ArrayToBytesCodecTraits: ArrayCodecTraits + core::fmt::Debug {
     /// Chunk elements are written to the subset of the output in C order.
     ///
     /// # Errors
-    /// Returns [`CodecError`] if a codec fails or the decoded output is incompatible with `decoded_representation`.
-    ///
-    /// # Safety
-    /// The caller must ensure that:
-    /// - `output` holds enough space for the preallocated bytes of an array with shape `output_shape` of the appropriate data type, and
-    /// - `output_subset` is within the bounds of `output_shape`, and
-    /// - `output_subset` has the same number of elements as the decoded representation shape.
-    unsafe fn decode_into(
+    /// Returns [`CodecError`] if a codec fails or the number of elements in `decoded_representation` does not match the number of elements in `output_view`.
+    fn decode_into(
         &self,
         bytes: RawBytes<'_>,
         decoded_representation: &ChunkRepresentation,
-        output: &UnsafeCellSlice<u8>,
-        output_shape: &[u64],
-        output_subset: &ArraySubset,
+        output_view: &mut ArrayBytesFixedDisjointView<'_>,
         options: &CodecOptions,
     ) -> Result<(), CodecError> {
-        debug_assert!(output_subset.inbounds(output_shape));
-        debug_assert_eq!(
-            decoded_representation.num_elements(),
-            output_subset.num_elements()
-        );
+        if decoded_representation.num_elements() != output_view.num_elements() {
+            return Err(InvalidNumberOfElementsError::new(
+                output_view.num_elements(),
+                decoded_representation.num_elements(),
+            )
+            .into());
+        }
         let decoded_value = self.decode(bytes, decoded_representation, options)?;
         if let ArrayBytes::Fixed(decoded_value) = decoded_value {
-            update_bytes_flen(
-                output,
-                output_shape,
-                &decoded_value,
-                output_subset,
-                decoded_representation.data_type().fixed_size().unwrap(),
-            );
+            output_view.copy_from_slice(&decoded_value)?;
         } else {
             return Err(CodecError::ExpectedFixedLengthBytes);
         }
@@ -961,6 +941,76 @@ impl AsyncBytesPartialDecoderTraits for std::io::Cursor<Vec<u8>> {
     }
 }
 
+/// An error indicating the length of bytes does not match the expected length.
+#[derive(Debug, Error, Display)]
+#[display("Invalid bytes len {len}, expected {expected_len}")]
+pub struct InvalidBytesLengthError {
+    len: usize,
+    expected_len: usize,
+}
+
+impl InvalidBytesLengthError {
+    /// Create a new [`InvalidBytesLengthError`].
+    #[must_use]
+    pub fn new(len: usize, expected_len: usize) -> Self {
+        Self { len, expected_len }
+    }
+}
+
+/// An error indicating the shape is not compatible with the expected number of elements.
+#[derive(Debug, Error, Display)]
+#[display("Invalid shape {shape:?} for number of elements {expected_num_elements}")]
+pub struct InvalidArrayShapeError {
+    shape: ArrayShape,
+    expected_num_elements: usize,
+}
+
+impl InvalidArrayShapeError {
+    /// Create a new [`InvalidArrayShapeError`].
+    #[must_use]
+    pub fn new(shape: ArrayShape, expected_num_elements: usize) -> Self {
+        Self {
+            shape,
+            expected_num_elements,
+        }
+    }
+}
+
+/// An error indicating the length of elements does not match the expected length.
+#[derive(Debug, Error, Display)]
+#[display("Invalid number of elements {num}, expected {expected}")]
+pub struct InvalidNumberOfElementsError {
+    num: u64,
+    expected: u64,
+}
+
+impl InvalidNumberOfElementsError {
+    /// Create a new [`InvalidNumberOfElementsError`].
+    #[must_use]
+    pub fn new(num: u64, expected: u64) -> Self {
+        Self { num, expected }
+    }
+}
+
+/// An array subset is out of bounds.
+#[derive(Debug, Error, Display)]
+#[display("Subset {subset} is out of bounds of {must_be_within}")]
+pub struct SubsetOutOfBoundsError {
+    subset: ArraySubset,
+    must_be_within: ArraySubset,
+}
+
+impl SubsetOutOfBoundsError {
+    /// Create a new [`SubsetOutOfBoundsError`].
+    #[must_use]
+    pub fn new(subset: ArraySubset, must_be_within: ArraySubset) -> Self {
+        Self {
+            subset,
+            must_be_within,
+        }
+    }
+}
+
 /// A codec error.
 #[derive(Debug, Error)]
 pub enum CodecError {
@@ -977,8 +1027,8 @@ pub enum CodecError {
     #[error("the array subset {_0} has the wrong dimensionality, expected {_1}")]
     InvalidArraySubsetDimensionalityError(ArraySubset, usize),
     /// The decoded size of a chunk did not match what was expected.
- #[error("the size of a decoded chunk is {_0}, expected {_1}")] - UnexpectedChunkDecodedSize(usize, u64), + #[error("the size of a decoded chunk is {}, expected {}", _0.len, _0.expected_len)] + UnexpectedChunkDecodedSize(#[from] InvalidBytesLengthError), /// An embedded checksum does not match the decoded value. #[error("the checksum is invalid")] InvalidChecksum, @@ -1003,6 +1053,21 @@ pub enum CodecError { /// Expected variable length bytes. #[error("Expected variable length array bytes")] ExpectedVariableLengthBytes, + /// Invalid array shape. + #[error(transparent)] + InvalidArrayShape(#[from] InvalidArrayShapeError), + /// Invalid number of elements. + #[error(transparent)] + InvalidNumberOfElements(#[from] InvalidNumberOfElementsError), + /// Subset out of bounds. + #[error(transparent)] + SubsetOutOfBounds(#[from] SubsetOutOfBoundsError), + /// Invalid byte offsets for variable length data. + #[error(transparent)] + RawBytesOffsetsCreate(#[from] RawBytesOffsetsCreateError), + /// Variable length array bytes offsets are out of bounds. + #[error(transparent)] + RawBytesOffsetsOutOfBounds(#[from] RawBytesOffsetsOutOfBoundsError), } impl From<&str> for CodecError { diff --git a/zarrs/src/array/codec/array_partial_encoder_default.rs b/zarrs/src/array/codec/array_partial_encoder_default.rs index 407c07ab..98b78b20 100644 --- a/zarrs/src/array/codec/array_partial_encoder_default.rs +++ b/zarrs/src/array/codec/array_partial_encoder_default.rs @@ -76,15 +76,13 @@ impl ArrayPartialEncoderTraits for ArrayPartialEncoderDefault { self.decoded_representation.data_type().size(), )?; - chunk_bytes = unsafe { - update_array_bytes( - chunk_bytes, - &chunk_shape, - chunk_subset, - chunk_subset_bytes, - self.decoded_representation.data_type().size(), - ) - }; + chunk_bytes = update_array_bytes( + chunk_bytes, + &chunk_shape, + chunk_subset, + chunk_subset_bytes, + self.decoded_representation.data_type().size(), + )?; } let is_fill_value = !options.store_empty_chunks() diff --git a/zarrs/src/array/codec/array_to_array/bitround/bitround_codec.rs b/zarrs/src/array/codec/array_to_array/bitround/bitround_codec.rs index 51c8249f..fc288375 100644 --- a/zarrs/src/array/codec/array_to_array/bitround/bitround_codec.rs +++ b/zarrs/src/array/codec/array_to_array/bitround/bitround_codec.rs @@ -5,9 +5,9 @@ use crate::{ codec::{ options::CodecOptions, ArrayBytes, ArrayCodecTraits, ArrayPartialDecoderTraits, ArrayPartialEncoderTraits, ArrayToArrayCodecTraits, ArrayToArrayPartialEncoderDefault, - CodecError, CodecTraits, RecommendedConcurrency, + CodecError, CodecMetadataOptions, CodecTraits, RecommendedConcurrency, }, - ArrayMetadataOptions, ChunkRepresentation, ChunkShape, DataType, + ChunkRepresentation, ChunkShape, DataType, }, config::global_config, metadata::v3::MetadataV3, @@ -47,7 +47,7 @@ impl BitroundCodec { } impl CodecTraits for BitroundCodec { - fn create_metadata_opt(&self, options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, options: &CodecMetadataOptions) -> Option { if options.experimental_codec_store_metadata_if_encode_only() { let configuration = BitroundCodecConfigurationV1 { keepbits: self.keepbits, diff --git a/zarrs/src/array/codec/array_to_array/transpose.rs b/zarrs/src/array/codec/array_to_array/transpose.rs index 7818ccf7..8e0f1ec5 100644 --- a/zarrs/src/array/codec/array_to_array/transpose.rs +++ b/zarrs/src/array/codec/array_to_array/transpose.rs @@ -120,8 +120,15 @@ fn transpose_vlen<'a>( bytes_new.extend_from_slice(&bytes[curr..next]); } 
offsets_new.push(bytes_new.len()); - - ArrayBytes::new_vlen(bytes_new, offsets_new) + let offsets_new = unsafe { + // SAFETY: The offsets are monotonically increasing. + RawBytesOffsets::new_unchecked(offsets_new) + }; + let array_bytes = unsafe { + // SAFETY: The last offset is equal to the length of the bytes + ArrayBytes::new_vlen_unchecked(bytes_new, offsets_new) + }; + array_bytes } #[cfg(test)] diff --git a/zarrs/src/array/codec/array_to_array/transpose/transpose_codec.rs b/zarrs/src/array/codec/array_to_array/transpose/transpose_codec.rs index 1ac8a968..85032f30 100644 --- a/zarrs/src/array/codec/array_to_array/transpose/transpose_codec.rs +++ b/zarrs/src/array/codec/array_to_array/transpose/transpose_codec.rs @@ -5,9 +5,9 @@ use crate::{ codec::{ options::CodecOptions, ArrayBytes, ArrayCodecTraits, ArrayPartialDecoderTraits, ArrayPartialEncoderTraits, ArrayToArrayCodecTraits, ArrayToArrayPartialEncoderDefault, - CodecError, CodecTraits, RecommendedConcurrency, + CodecError, CodecMetadataOptions, CodecTraits, RecommendedConcurrency, }, - ArrayMetadataOptions, ChunkRepresentation, ChunkShape, + ChunkRepresentation, ChunkShape, }, metadata::v3::{array::codec::transpose::TransposeCodecConfigurationV1, MetadataV3}, plugin::PluginCreateError, @@ -48,7 +48,7 @@ impl TransposeCodec { } impl CodecTraits for TransposeCodec { - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { let configuration = TransposeCodecConfigurationV1 { order: self.order.clone(), }; diff --git a/zarrs/src/array/codec/array_to_array_partial_encoder_default.rs b/zarrs/src/array/codec/array_to_array_partial_encoder_default.rs index 5c854b8c..2cea5e05 100644 --- a/zarrs/src/array/codec/array_to_array_partial_encoder_default.rs +++ b/zarrs/src/array/codec/array_to_array_partial_encoder_default.rs @@ -87,15 +87,13 @@ impl ArrayPartialEncoderTraits for ArrayToArrayPartialEncoderDefault { self.decoded_representation.data_type().size(), )?; - decoded_value = unsafe { - update_array_bytes( - decoded_value, - &chunk_shape, - chunk_subset, - chunk_subset_bytes, - self.decoded_representation.data_type().size(), - ) - }; + decoded_value = update_array_bytes( + decoded_value, + &chunk_shape, + chunk_subset, + chunk_subset_bytes, + self.decoded_representation.data_type().size(), + )?; } let is_fill_value = !options.store_empty_chunks() diff --git a/zarrs/src/array/codec/array_to_bytes/bytes.rs b/zarrs/src/array/codec/array_to_bytes/bytes.rs index 98ba7fc9..4962dc1e 100644 --- a/zarrs/src/array/codec/array_to_bytes/bytes.rs +++ b/zarrs/src/array/codec/array_to_bytes/bytes.rs @@ -73,6 +73,10 @@ pub(crate) fn reverse_endianness(v: &mut [u8], data_type: &DataType) { } // Variable-sized data types are not supported and are rejected outside of this function DataType::String | DataType::Bytes => unreachable!(), + _ => { + // FIXME: Data type extensions, endianness reversal for custom data types + unimplemented!("Reverse endianness for data type {:?}", data_type) + } } } @@ -297,7 +301,7 @@ mod tests { .map(|bytes| bytes.into_fixed().unwrap().to_vec()) .flatten() .collect::>() - .chunks(std::mem::size_of::()) + .chunks(size_of::()) .map(|b| u8::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![4, 8]; @@ -343,7 +347,7 @@ mod tests { .map(|bytes| bytes.into_fixed().unwrap().to_vec()) .flatten() .collect::>() - .chunks(std::mem::size_of::()) + .chunks(size_of::()) .map(|b| u8::from_ne_bytes(b.try_into().unwrap())) 
.collect(); let answer: Vec = vec![4, 8]; diff --git a/zarrs/src/array/codec/array_to_bytes/bytes/bytes_codec.rs b/zarrs/src/array/codec/array_to_bytes/bytes/bytes_codec.rs index f6cff4a3..a199d326 100644 --- a/zarrs/src/array/codec/array_to_bytes/bytes/bytes_codec.rs +++ b/zarrs/src/array/codec/array_to_bytes/bytes/bytes_codec.rs @@ -7,11 +7,10 @@ use crate::{ codec::{ ArrayCodecTraits, ArrayPartialDecoderTraits, ArrayPartialEncoderDefault, ArrayPartialEncoderTraits, ArrayToBytesCodecTraits, BytesPartialDecoderTraits, - BytesPartialEncoderTraits, CodecError, CodecOptions, CodecTraits, - RecommendedConcurrency, + BytesPartialEncoderTraits, CodecError, CodecMetadataOptions, CodecOptions, CodecTraits, + InvalidBytesLengthError, RecommendedConcurrency, }, - ArrayBytes, ArrayMetadataOptions, BytesRepresentation, ChunkRepresentation, DataTypeSize, - RawBytes, + ArrayBytes, BytesRepresentation, ChunkRepresentation, DataTypeSize, RawBytes, }, metadata::v3::MetadataV3, }; @@ -77,12 +76,11 @@ impl BytesCodec { )); } DataTypeSize::Fixed(data_type_size) => { - let array_size = decoded_representation.num_elements() * data_type_size as u64; - if value.len() as u64 != array_size { - return Err(CodecError::UnexpectedChunkDecodedSize( - value.len(), - array_size, - )); + let array_size = + usize::try_from(decoded_representation.num_elements() * data_type_size as u64) + .unwrap(); + if value.len() != array_size { + return Err(InvalidBytesLengthError::new(value.len(), array_size).into()); } else if data_type_size > 1 && self.endian.is_none() { return Err(CodecError::Other(format!( "tried to encode an array with element size {data_type_size} with endianness None" @@ -101,7 +99,7 @@ impl BytesCodec { } impl CodecTraits for BytesCodec { - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { let configuration = BytesCodecConfigurationV1 { endian: self.endian, }; diff --git a/zarrs/src/array/codec/array_to_bytes/codec_chain.rs b/zarrs/src/array/codec/array_to_bytes/codec_chain.rs index 2b41fe63..8fc9bd95 100644 --- a/zarrs/src/array/codec/array_to_bytes/codec_chain.rs +++ b/zarrs/src/array/codec/array_to_bytes/codec_chain.rs @@ -2,22 +2,19 @@ use std::sync::Arc; -use unsafe_cell_slice::UnsafeCellSlice; - use crate::{ array::{ - array_bytes::update_bytes_flen, codec::{ ArrayCodecTraits, ArrayPartialDecoderCache, ArrayPartialDecoderTraits, ArrayPartialEncoderTraits, ArrayToArrayCodecTraits, ArrayToBytesCodecTraits, BytesPartialDecoderCache, BytesPartialDecoderTraits, BytesPartialEncoderTraits, - BytesToBytesCodecTraits, Codec, CodecError, CodecOptions, CodecTraits, + BytesToBytesCodecTraits, Codec, CodecError, CodecMetadataOptions, CodecOptions, + CodecTraits, }, concurrency::RecommendedConcurrency, - ArrayBytes, ArrayMetadataOptions, BytesRepresentation, ChunkRepresentation, ChunkShape, - RawBytes, + ArrayBytes, ArrayBytesFixedDisjointView, BytesRepresentation, ChunkRepresentation, + ChunkShape, RawBytes, }, - array_subset::ArraySubset, metadata::v3::MetadataV3, plugin::PluginCreateError, }; @@ -137,7 +134,7 @@ impl CodecChain { /// Create codec chain metadata. 
#[must_use] - pub fn create_metadatas_opt(&self, options: &ArrayMetadataOptions) -> Vec { + pub fn create_metadatas_opt(&self, options: &CodecMetadataOptions) -> Vec { let mut metadatas = Vec::with_capacity(self.array_to_array.len() + 1 + self.bytes_to_bytes.len()); for codec in &self.array_to_array { @@ -159,7 +156,7 @@ impl CodecChain { /// Create codec chain metadata with default options. #[must_use] pub fn create_metadatas(&self) -> Vec { - self.create_metadatas_opt(&ArrayMetadataOptions::default()) + self.create_metadatas_opt(&CodecMetadataOptions::default()) } /// Get the array to array codecs @@ -215,7 +212,7 @@ impl CodecTraits for CodecChain { /// Returns [`None`] since a codec chain does not have standard codec metadata. /// /// Note that usage of the codec chain is explicit in [`Array`](crate::array::Array) and [`CodecChain::create_metadatas_opt()`] will call [`CodecTraits::create_metadata_opt()`] from for each codec. - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { None } @@ -309,13 +306,11 @@ impl ArrayToBytesCodecTraits for CodecChain { Ok(bytes) } - unsafe fn decode_into( + fn decode_into( &self, mut bytes: RawBytes<'_>, decoded_representation: &ChunkRepresentation, - output: &UnsafeCellSlice, - output_shape: &[u64], - output_subset: &ArraySubset, + output_view: &mut ArrayBytesFixedDisjointView<'_>, options: &CodecOptions, ) -> Result<(), CodecError> { let array_representations = @@ -325,16 +320,12 @@ impl ArrayToBytesCodecTraits for CodecChain { if self.bytes_to_bytes.is_empty() && self.array_to_array.is_empty() { // Fast path if no bytes to bytes or array to array codecs - return unsafe { - self.array_to_bytes.decode_into( - bytes, - array_representations.last().unwrap(), - output, - output_shape, - output_subset, - options, - ) - }; + return self.array_to_bytes.decode_into( + bytes, + array_representations.last().unwrap(), + output_view, + options, + ); } // bytes->bytes @@ -347,16 +338,12 @@ impl ArrayToBytesCodecTraits for CodecChain { if self.array_to_array.is_empty() { // Fast path if no array to array codecs - return unsafe { - self.array_to_bytes.decode_into( - bytes, - array_representations.last().unwrap(), - output, - output_shape, - output_subset, - options, - ) - }; + return self.array_to_bytes.decode_into( + bytes, + array_representations.last().unwrap(), + output_view, + options, + ); } // bytes->array @@ -377,13 +364,7 @@ impl ArrayToBytesCodecTraits for CodecChain { )?; if let ArrayBytes::Fixed(decoded_value) = bytes { - update_bytes_flen( - output, - output_shape, - &decoded_value, - output_subset, - decoded_representation.data_type().fixed_size().unwrap(), - ); + output_view.copy_from_slice(&decoded_value)?; } else { // TODO: Variable length data type support? 
return Err(CodecError::ExpectedFixedLengthBytes); @@ -852,7 +833,7 @@ mod tests { .map(|bytes| bytes.into_fixed().unwrap().to_vec()) .flatten() .collect::>() - .chunks(std::mem::size_of::()) + .chunks(size_of::()) .map(|b| f32::from_ne_bytes(b.try_into().unwrap())) .collect(); println!("decoded_partial_chunk {decoded_partial_chunk:?}"); diff --git a/zarrs/src/array/codec/array_to_bytes/pcodec.rs b/zarrs/src/array/codec/array_to_bytes/pcodec.rs index cf77b2ab..47cc8904 100644 --- a/zarrs/src/array/codec/array_to_bytes/pcodec.rs +++ b/zarrs/src/array/codec/array_to_bytes/pcodec.rs @@ -271,7 +271,7 @@ mod tests { .map(|bytes| bytes.into_fixed().unwrap().into_owned()) .flatten() .collect::>() - .chunks(std::mem::size_of::()) + .chunks(size_of::()) .map(|b| u8::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![4, 8]; @@ -323,7 +323,7 @@ mod tests { .map(|bytes| bytes.into_fixed().unwrap().into_owned()) .flatten() .collect::>() - .chunks(std::mem::size_of::()) + .chunks(size_of::()) .map(|b| u8::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![4, 8]; diff --git a/zarrs/src/array/codec/array_to_bytes/pcodec/pcodec_codec.rs b/zarrs/src/array/codec/array_to_bytes/pcodec/pcodec_codec.rs index 10c9f6db..94b47549 100644 --- a/zarrs/src/array/codec/array_to_bytes/pcodec/pcodec_codec.rs +++ b/zarrs/src/array/codec/array_to_bytes/pcodec/pcodec_codec.rs @@ -10,11 +10,11 @@ use crate::{ codec::{ ArrayBytes, ArrayCodecTraits, ArrayPartialDecoderTraits, ArrayPartialEncoderDefault, ArrayPartialEncoderTraits, ArrayToBytesCodecTraits, BytesPartialDecoderTraits, - BytesPartialEncoderTraits, CodecError, CodecOptions, CodecTraits, RawBytes, - RecommendedConcurrency, + BytesPartialEncoderTraits, CodecError, CodecMetadataOptions, CodecOptions, CodecTraits, + RawBytes, RecommendedConcurrency, }, - convert_from_bytes_slice, transmute_to_bytes_vec, ArrayMetadataOptions, - BytesRepresentation, ChunkRepresentation, DataType, + convert_from_bytes_slice, transmute_to_bytes_vec, BytesRepresentation, ChunkRepresentation, + DataType, }, config::global_config, metadata::v3::{array::codec::pcodec::PcodecModeSpecConfiguration, MetadataV3}, @@ -84,7 +84,7 @@ impl PcodecCodec { } impl CodecTraits for PcodecCodec { - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { let mode_spec = mode_spec_pco_to_config(&self.chunk_config.mode_spec); let (delta_spec, delta_encoding_order) = match self.chunk_config.delta_spec { DeltaSpec::Auto => (PcodecDeltaSpecConfiguration::Auto, None), diff --git a/zarrs/src/array/codec/array_to_bytes/sharding.rs b/zarrs/src/array/codec/array_to_bytes/sharding.rs index b0a7e216..7e2a5955 100644 --- a/zarrs/src/array/codec/array_to_bytes/sharding.rs +++ b/zarrs/src/array/codec/array_to_bytes/sharding.rs @@ -111,7 +111,7 @@ fn decode_shard_index( )?; let decoded_shard_index = decoded_shard_index.into_fixed()?; Ok(decoded_shard_index - .chunks_exact(core::mem::size_of::()) + .chunks_exact(size_of::()) .map(|v| u64::from_ne_bytes(v.try_into().unwrap() /* safe */)) .collect()) } @@ -499,7 +499,7 @@ mod tests { .map(|bytes| bytes.into_fixed().unwrap().to_vec()) .flatten() .collect::>() - .chunks(std::mem::size_of::()) + .chunks(size_of::()) .map(|b| u8::from_ne_bytes(b.try_into().unwrap())) .collect(); assert_eq!(answer, decoded_partial_chunk); @@ -584,7 +584,7 @@ mod tests { .map(|bytes| bytes.into_fixed().unwrap().to_vec()) .flatten() .collect::>() - 
.chunks(std::mem::size_of::()) + .chunks(size_of::()) .map(|b| u8::from_ne_bytes(b.try_into().unwrap())) .collect(); assert_eq!(answer, decoded_partial_chunk); @@ -653,7 +653,7 @@ mod tests { .map(|bytes| bytes.into_fixed().unwrap().to_vec()) .flatten() .collect::>() - .chunks(std::mem::size_of::()) + .chunks(size_of::()) .map(|b| u16::from_ne_bytes(b.try_into().unwrap())) .collect(); @@ -695,7 +695,7 @@ mod tests { .map(|bytes| bytes.into_fixed().unwrap().to_vec()) .flatten() .collect::>() - .chunks(std::mem::size_of::()) + .chunks(size_of::()) .map(|b| u8::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![4, 8]; diff --git a/zarrs/src/array/codec/array_to_bytes/sharding/sharding_codec.rs b/zarrs/src/array/codec/array_to_bytes/sharding/sharding_codec.rs index f6ec782f..ccf76f54 100644 --- a/zarrs/src/array/codec/array_to_bytes/sharding/sharding_codec.rs +++ b/zarrs/src/array/codec/array_to_bytes/sharding/sharding_codec.rs @@ -6,16 +6,17 @@ use std::{ use crate::{ array::{ - array_bytes::{merge_chunks_vlen, update_bytes_flen}, + array_bytes::merge_chunks_vlen, chunk_shape_to_array_shape, codec::{ ArrayCodecTraits, ArrayPartialDecoderTraits, ArrayPartialEncoderTraits, ArrayToBytesCodecTraits, BytesPartialDecoderTraits, BytesPartialEncoderTraits, - CodecChain, CodecError, CodecOptions, CodecTraits, RecommendedConcurrency, + CodecChain, CodecError, CodecMetadataOptions, CodecOptions, CodecTraits, + RecommendedConcurrency, }, concurrency::calc_concurrency_outer_inner, - transmute_to_bytes_vec, unravel_index, ArrayBytes, ArrayMetadataOptions, ArraySize, - BytesRepresentation, ChunkRepresentation, ChunkShape, DataTypeSize, FillValue, RawBytes, + transmute_to_bytes_vec, unravel_index, ArrayBytes, ArrayBytesFixedDisjointView, ArraySize, + BytesRepresentation, ChunkRepresentation, ChunkShape, DataTypeSize, RawBytes, }, array_subset::ArraySubset, metadata::v3::MetadataV3, @@ -85,7 +86,7 @@ impl ShardingCodec { } impl CodecTraits for ShardingCodec { - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { let configuration = ShardingCodecConfigurationV1 { chunk_shape: self.chunk_shape.clone(), codecs: self.inner_codecs.create_metadatas(), @@ -104,21 +105,6 @@ impl CodecTraits for ShardingCodec { } } -/// Repeat the fill value into a contiguous vec -/// The length is the contiguous elements of an inner chunk in the shard. See `ContiguousLinearisedIndices`. 
-fn get_contiguous_fill_value( - fill_value: &FillValue, - chunk_shape: &[NonZeroU64], - shard_shape: &[u64], -) -> Vec { - let chunk_subset = ArraySubset::new_with_shape(chunk_shape_to_array_shape(chunk_shape)); - let contiguous_iterator = - unsafe { chunk_subset.contiguous_linearised_indices_unchecked(shard_shape) }; - fill_value - .as_ne_bytes() - .repeat(contiguous_iterator.contiguous_elements_usize()) -} - impl ArrayCodecTraits for ShardingCodec { fn recommended_concurrency( &self, @@ -182,7 +168,6 @@ impl ArrayToBytesCodecTraits for ShardingCodec { shard_representation: &ChunkRepresentation, options: &CodecOptions, ) -> Result, CodecError> { - let shard_shape = shard_representation.shape_u64(); let chunk_representation = unsafe { ChunkRepresentation::new_unchecked( self.chunk_shape.as_slice().to_vec(), @@ -201,10 +186,6 @@ impl ArrayToBytesCodecTraits for ShardingCodec { let shard_index = self.decode_index(&encoded_shard, chunks_per_shard.as_slice(), options)?; - let any_empty = shard_index - .par_iter() - .any(|offset_or_size| *offset_or_size == u64::MAX); - // Calc self/internal concurrent limits let (shard_concurrent_limit, concurrency_limit_inner_chunks) = calc_concurrency_outer_inner( options.concurrent_target(), @@ -271,49 +252,29 @@ impl ArrayToBytesCodecTraits for ShardingCodec { } let mut decoded_shard = Vec::::with_capacity(size_output); - let contiguous_fill_value = if any_empty { - Some(get_contiguous_fill_value( - shard_representation.fill_value(), - &self.chunk_shape, - &shard_shape, - )) - } else { - None - }; - { let output = UnsafeCellSlice::new_from_vec_with_spare_capacity(&mut decoded_shard); + let shard_shape = shard_representation.shape_u64(); let decode_chunk = |chunk_index: usize| { let chunk_subset = self .chunk_index_to_subset(chunk_index as u64, chunks_per_shard.as_slice()); + let mut output_view_inner_chunk = unsafe { + // SAFETY: chunks represent disjoint array subsets + ArrayBytesFixedDisjointView::new_unchecked( + output, + data_type_size, + &shard_shape, + chunk_subset, + ) + }; // Read the offset/size let offset = shard_index[chunk_index * 2]; let size = shard_index[chunk_index * 2 + 1]; if offset == u64::MAX && size == u64::MAX { - if let Some(fv) = &contiguous_fill_value { - let contiguous_iterator = unsafe { - chunk_subset - .contiguous_linearised_indices_unchecked(&shard_shape) - }; - let elements = contiguous_iterator.contiguous_elements(); - for index in &contiguous_iterator { - debug_assert_eq!( - fv.len() as u64, - elements * data_type_size as u64 - ); - let shard_offset = - usize::try_from(index * data_type_size as u64).unwrap(); - unsafe { - output - .index_mut(shard_offset..shard_offset + fv.len()) - .copy_from_slice(fv); - } - } - } else { - unreachable!(); - } + output_view_inner_chunk + .fill(shard_representation.fill_value().as_ne_bytes())?; } else if usize::try_from(offset + size).unwrap() > encoded_shard.len() { return Err(CodecError::Other( "The shard index references out-of-bounds bytes. The chunk may be corrupted." @@ -328,13 +289,9 @@ impl ArrayToBytesCodecTraits for ShardingCodec { &chunk_representation, &options, )?; - update_bytes_flen( - &output, - &shard_representation.shape_u64(), - &decoded_chunk.into_fixed()?, - &chunk_subset, - data_type_size, - ); + output_view_inner_chunk + .copy_from_slice(&decoded_chunk.into_fixed()?) 
+ .map_err(CodecError::from)?; }; Ok::<_, CodecError>(()) @@ -354,16 +311,13 @@ impl ArrayToBytesCodecTraits for ShardingCodec { } #[allow(clippy::too_many_lines)] - unsafe fn decode_into( + fn decode_into( &self, encoded_shard: RawBytes<'_>, shard_representation: &ChunkRepresentation, - output: &UnsafeCellSlice, - output_shape: &[u64], - output_subset: &ArraySubset, + output_view: &mut ArrayBytesFixedDisjointView<'_>, options: &CodecOptions, ) -> Result<(), CodecError> { - let shard_shape = shard_representation.shape_u64(); let chunk_representation = unsafe { ChunkRepresentation::new_unchecked( self.chunk_shape.as_slice().to_vec(), @@ -382,10 +336,6 @@ impl ArrayToBytesCodecTraits for ShardingCodec { let shard_index = self.decode_index(&encoded_shard, chunks_per_shard.as_slice(), options)?; - let any_empty = shard_index - .par_iter() - .any(|offset_or_size| *offset_or_size == u64::MAX); - // Calc self/internal concurrent limits let (shard_concurrent_limit, concurrency_limit_inner_chunks) = calc_concurrency_outer_inner( options.concurrent_target(), @@ -399,96 +349,55 @@ impl ArrayToBytesCodecTraits for ShardingCodec { .concurrent_target(concurrency_limit_inner_chunks) .build(); - match shard_representation.data_type().size() { - DataTypeSize::Variable => { - // TODO: Variable length data type support? - Err(CodecError::ExpectedFixedLengthBytes) - } - DataTypeSize::Fixed(data_type_size) => { - let contiguous_fill_value = if any_empty { - Some(get_contiguous_fill_value( - shard_representation.fill_value(), - &self.chunk_shape, - &shard_shape, - )) - } else { - None - }; - - { - let decode_chunk = |chunk_index: usize| { - let chunk_subset = self - .chunk_index_to_subset(chunk_index as u64, chunks_per_shard.as_slice()); + let decode_chunk = |chunk_index: usize| { + let chunk_subset = + self.chunk_index_to_subset(chunk_index as u64, chunks_per_shard.as_slice()); - let output_subset_chunk = ArraySubset::new_with_start_shape( - std::iter::zip(output_subset.start(), chunk_subset.start()) - .map(|(o, s)| o + s) - .collect(), - chunk_subset.shape().to_vec(), - ) - .unwrap(); + let output_subset_chunk = ArraySubset::new_with_start_shape( + std::iter::zip(output_view.subset().start(), chunk_subset.start()) + .map(|(o, s)| o + s) + .collect(), + chunk_subset.shape().to_vec(), + ) + .unwrap(); + let mut output_view_inner_chunk = unsafe { + // SAFETY: inner chunks represent disjoint array subsets + output_view.subdivide_unchecked(output_subset_chunk) + }; - // Read the offset/size - let offset = shard_index[chunk_index * 2]; - let size = shard_index[chunk_index * 2 + 1]; - if offset == u64::MAX && size == u64::MAX { - if let Some(fv) = &contiguous_fill_value { - let contiguous_iterator = unsafe { - output_subset_chunk - .contiguous_linearised_indices_unchecked(output_shape) - }; - let elements = contiguous_iterator.contiguous_elements(); - for index in &contiguous_iterator { - debug_assert_eq!( - fv.len() as u64, - elements * data_type_size as u64 - ); - let shard_offset = - usize::try_from(index * data_type_size as u64).unwrap(); - unsafe { - output - .index_mut(shard_offset..shard_offset + fv.len()) - .copy_from_slice(fv); - } - } - } else { - unreachable!(); - } - } else if usize::try_from(offset + size).unwrap() > encoded_shard.len() { - return Err(CodecError::Other( - "The shard index references out-of-bounds bytes. The chunk may be corrupted." 
- .to_string(), - )); - } else { - let offset: usize = offset.try_into().unwrap(); - let size: usize = size.try_into().unwrap(); - let encoded_chunk = &encoded_shard[offset..offset + size]; - unsafe { - self.inner_codecs.decode_into( - Cow::Borrowed(encoded_chunk), - &chunk_representation, - output, - output_shape, - &output_subset_chunk, - &options, - )?; - } - }; + // Read the offset/size + let offset = shard_index[chunk_index * 2]; + let size = shard_index[chunk_index * 2 + 1]; + if offset == u64::MAX && size == u64::MAX { + output_view_inner_chunk.fill(shard_representation.fill_value().as_ne_bytes())?; + } else if usize::try_from(offset + size).unwrap() > encoded_shard.len() { + return Err(CodecError::Other( + "The shard index references out-of-bounds bytes. The chunk may be corrupted." + .to_string(), + )); + } else { + let offset: usize = offset.try_into().unwrap(); + let size: usize = size.try_into().unwrap(); + let encoded_chunk = &encoded_shard[offset..offset + size]; + self.inner_codecs.decode_into( + Cow::Borrowed(encoded_chunk), + &chunk_representation, + &mut output_view_inner_chunk, + &options, + )?; + }; - Ok::<_, CodecError>(()) - }; + Ok::<_, CodecError>(()) + }; - rayon_iter_concurrent_limit::iter_concurrent_limit!( - shard_concurrent_limit, - (0..num_chunks), - try_for_each, - decode_chunk - )?; + rayon_iter_concurrent_limit::iter_concurrent_limit!( + shard_concurrent_limit, + (0..num_chunks), + try_for_each, + decode_chunk + )?; - Ok(()) - } - } - } + Ok(()) } fn partial_decoder( diff --git a/zarrs/src/array/codec/array_to_bytes/sharding/sharding_partial_decoder.rs b/zarrs/src/array/codec/array_to_bytes/sharding/sharding_partial_decoder.rs index bc90361e..303b70b0 100644 --- a/zarrs/src/array/codec/array_to_bytes/sharding/sharding_partial_decoder.rs +++ b/zarrs/src/array/codec/array_to_bytes/sharding/sharding_partial_decoder.rs @@ -5,15 +5,15 @@ use unsafe_cell_slice::UnsafeCellSlice; use zarrs_storage::byte_range::ByteRange; use crate::array::{ - array_bytes::{merge_chunks_vlen, update_bytes_flen}, + array_bytes::merge_chunks_vlen, codec::{ ArrayCodecTraits, ArrayPartialDecoderTraits, ArraySubset, ArrayToBytesCodecTraits, ByteIntervalPartialDecoder, BytesPartialDecoderTraits, CodecChain, CodecError, CodecOptions, }, concurrency::{calc_concurrency_outer_inner, RecommendedConcurrency}, - ravel_indices, ArrayBytes, ArraySize, ChunkRepresentation, ChunkShape, DataType, DataTypeSize, - RawBytes, + ravel_indices, ArrayBytes, ArrayBytesFixedDisjointView, ArraySize, ChunkRepresentation, + ChunkShape, DataType, DataTypeSize, RawBytes, }; #[cfg(feature = "async")] @@ -305,16 +305,20 @@ impl ArrayPartialDecoderTraits for ShardingPartialDecoder { .into_owned() }; let decoded_bytes = decoded_bytes.into_fixed()?; - update_bytes_flen( - &out_array_subset_slice, - array_subset.shape(), - &decoded_bytes, - &chunk_subset_overlap - .relative_to(array_subset.start()) - .unwrap(), - data_type_size, - ); - Ok::<_, CodecError>(()) + let mut output_view = unsafe { + // SAFETY: chunks represent disjoint array subsets + ArrayBytesFixedDisjointView::new_unchecked( + out_array_subset_slice, + data_type_size, + array_subset.shape(), + chunk_subset_overlap + .relative_to(array_subset.start()) + .unwrap(), + ) + }; + output_view + .copy_from_slice(&decoded_bytes) + .map_err(CodecError::from) }; rayon_iter_concurrent_limit::iter_concurrent_limit!( @@ -597,15 +601,20 @@ impl AsyncArrayPartialDecoderTraits for AsyncShardingPartialDecoder { Vec, ArraySubset, ) = subset_and_decoded_chunk?; - 
update_bytes_flen( - &shard_slice, - array_subset.shape(), - &chunk_subset_bytes.into(), - &chunk_subset_overlap - .relative_to(array_subset.start()) - .unwrap(), - data_type_size, - ); + let mut output_view = unsafe { + // SAFETY: chunks represent disjoint array subsets + ArrayBytesFixedDisjointView::new_unchecked( + shard_slice, + data_type_size, + array_subset.shape(), + chunk_subset_overlap + .relative_to(array_subset.start()) + .unwrap(), + ) + }; + output_view + .copy_from_slice(&chunk_subset_bytes) + .expect("chunk subset bytes are the correct length"); Ok::<_, CodecError>(()) } )?; @@ -627,26 +636,26 @@ impl AsyncArrayPartialDecoderTraits for AsyncShardingPartialDecoder { rayon_iter_concurrent_limit::iter_concurrent_limit!( options.concurrent_target(), filled_chunks, - for_each, + try_for_each, |chunk_subset: &ArraySubset| { let chunk_subset_overlap = unsafe { array_subset.overlap_unchecked(chunk_subset) }; - let filled_chunk = self - .decoded_representation - .fill_value() - .as_ne_bytes() - .repeat(chunk_subset_overlap.num_elements_usize()); - update_bytes_flen( - &shard_slice, - array_subset.shape(), - &filled_chunk.into(), - &chunk_subset_overlap - .relative_to(array_subset.start()) - .unwrap(), - data_type_size, - ); + let mut output_view = unsafe { + // SAFETY: chunks represent disjoint array subsets + ArrayBytesFixedDisjointView::new_unchecked( + shard_slice, + data_type_size, + array_subset.shape(), + chunk_subset_overlap + .relative_to(array_subset.start()) + .unwrap(), + ) + }; + output_view + .fill(self.decoded_representation.fill_value().as_ne_bytes()) + .map_err(CodecError::from) } - ); + )?; }; unsafe { shard.set_len(shard_size) }; out.push(ArrayBytes::from(shard)); diff --git a/zarrs/src/array/codec/array_to_bytes/sharding/sharding_partial_encoder.rs b/zarrs/src/array/codec/array_to_bytes/sharding/sharding_partial_encoder.rs index 2a5abbac..e01c9912 100644 --- a/zarrs/src/array/codec/array_to_bytes/sharding/sharding_partial_encoder.rs +++ b/zarrs/src/array/codec/array_to_bytes/sharding/sharding_partial_encoder.rs @@ -304,17 +304,15 @@ impl ArrayPartialEncoderTraits for ShardingPartialEncoder { }; // Update the inner chunk - let inner_chunk_updated = unsafe { - update_array_bytes( - inner_chunk_decoded, - &self.inner_chunk_representation.shape_u64(), - &inner_chunk_subset_overlap - .relative_to(inner_chunk_subset.start()) - .unwrap(), - &inner_chunk_bytes, - self.inner_chunk_representation.data_type().size(), - ) - }; + let inner_chunk_updated = update_array_bytes( + inner_chunk_decoded, + &self.inner_chunk_representation.shape_u64(), + &inner_chunk_subset_overlap + .relative_to(inner_chunk_subset.start()) + .unwrap(), + &inner_chunk_bytes, + self.inner_chunk_representation.data_type().size(), + )?; inner_chunks_decoded .lock() .unwrap() diff --git a/zarrs/src/array/codec/array_to_bytes/vlen.rs b/zarrs/src/array/codec/array_to_bytes/vlen.rs index 4075aa32..b246c08e 100644 --- a/zarrs/src/array/codec/array_to_bytes/vlen.rs +++ b/zarrs/src/array/codec/array_to_bytes/vlen.rs @@ -3,7 +3,7 @@ mod vlen_codec; mod vlen_partial_decoder; -use std::{mem::size_of, num::NonZeroU64, sync::Arc}; +use std::{num::NonZeroU64, sync::Arc}; use itertools::Itertools; pub use vlen::IDENTIFIER; @@ -13,7 +13,7 @@ pub use crate::metadata::v3::array::codec::vlen::{ }; use crate::{ array::{ - codec::{ArrayToBytesCodecTraits, CodecError, CodecOptions}, + codec::{ArrayToBytesCodecTraits, CodecError, CodecOptions, InvalidBytesLengthError}, convert_from_bytes_slice, ChunkRepresentation, CodecChain, 
DataType, Endianness, FillValue, RawBytes,
     },
@@ -62,10 +62,7 @@ fn get_vlen_bytes_and_offsets(
 ) -> Result<(Vec<u8>, Vec<usize>), CodecError> {
     // Get the index length and data start
     if bytes.len() < size_of::<u64>() {
-        return Err(CodecError::UnexpectedChunkDecodedSize(
-            bytes.len(),
-            size_of::<u64>() as u64,
-        ));
+        return Err(InvalidBytesLengthError::new(bytes.len(), size_of::<u64>()).into());
     }
     let index_len = u64::from_le_bytes(bytes[0..size_of::<u64>()].try_into().unwrap());
     let index_len = usize::try_from(index_len)
diff --git a/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs b/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs
index 4c181488..9e0a23a3 100644
--- a/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs
+++ b/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs
@@ -1,15 +1,15 @@
-use std::{mem::size_of, num::NonZeroU64, sync::Arc};
+use std::{num::NonZeroU64, sync::Arc};
 
 use crate::{
     array::{
         codec::{
             ArrayCodecTraits, ArrayPartialDecoderTraits, ArrayPartialEncoderDefault,
             ArrayPartialEncoderTraits, ArrayToBytesCodecTraits, BytesCodec,
-            BytesPartialDecoderTraits, BytesPartialEncoderTraits, CodecError, CodecOptions,
-            CodecTraits, RecommendedConcurrency,
+            BytesPartialDecoderTraits, BytesPartialEncoderTraits, CodecError, CodecMetadataOptions,
+            CodecOptions, CodecTraits, RecommendedConcurrency,
         },
-        transmute_to_bytes_vec, ArrayBytes, ArrayMetadataOptions, BytesRepresentation,
-        ChunkRepresentation, CodecChain, DataType, DataTypeSize, Endianness, FillValue, RawBytes,
+        transmute_to_bytes_vec, ArrayBytes, BytesRepresentation, ChunkRepresentation, CodecChain,
+        DataType, DataTypeSize, Endianness, FillValue, RawBytes, RawBytesOffsets,
     },
     config::global_config,
     metadata::v3::{array::codec::vlen::VlenIndexDataType, MetadataV3},
@@ -21,7 +21,7 @@ use crate::array::codec::{AsyncArrayPartialDecoderTraits, AsyncBytesPartialDecod
 
 use super::{vlen_partial_decoder, VlenCodecConfiguration, VlenCodecConfigurationV1};
 
-/// A `bytes` codec implementation.
+/// A `vlen` codec implementation.
 #[derive(Debug, Clone)]
 pub struct VlenCodec {
     index_codecs: Arc<CodecChain>,
@@ -83,7 +83,7 @@ impl VlenCodec {
 }
 
 impl CodecTraits for VlenCodec {
-    fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option<MetadataV3> {
+    fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option<MetadataV3> {
         let configuration = VlenCodecConfigurationV1 {
             index_codecs: self.index_codecs.create_metadatas(),
             data_codecs: self.data_codecs.create_metadatas(),
@@ -265,14 +265,16 @@ impl ArrayToBytesCodecTraits for VlenCodec {
             }
         }
         .unwrap();
-        let (data, index) = super::get_vlen_bytes_and_offsets(
+        let (bytes, offsets) = super::get_vlen_bytes_and_offsets(
             &index_chunk_rep,
             &bytes,
             &self.index_codecs,
             &self.data_codecs,
             options,
         )?;
-        Ok(ArrayBytes::new_vlen(data, index))
+        let offsets = RawBytesOffsets::new(offsets)?;
+        let array_bytes = ArrayBytes::new_vlen(bytes, offsets)?;
+        Ok(array_bytes)
     }
 
     fn partial_decoder(
diff --git a/zarrs/src/array/codec/array_to_bytes/vlen_v2.rs b/zarrs/src/array/codec/array_to_bytes/vlen_v2.rs
index 5a36eb96..e22a0d28 100644
--- a/zarrs/src/array/codec/array_to_bytes/vlen_v2.rs
+++ b/zarrs/src/array/codec/array_to_bytes/vlen_v2.rs
@@ -5,13 +5,16 @@ mod vlen_v2_partial_decoder;
 
 pub(crate) mod vlen_v2_macros;
 
-use std::{mem::size_of, sync::Arc};
+use std::sync::Arc;
 
 /// The identifier for the `vlen_v2` codec.
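A hedged sketch of the error-shape change used in `get_vlen_bytes_and_offsets` above: a dedicated length error converted into the codec error via `From`, instead of an enum variant carrying loose integers. All types here are illustrative stand-ins for the zarrs ones:

```rust
#[derive(Debug)]
pub struct InvalidBytesLengthError {
    len: usize,
    expected: usize,
}

impl InvalidBytesLengthError {
    pub fn new(len: usize, expected: usize) -> Self {
        Self { len, expected }
    }
}

#[derive(Debug)]
pub enum CodecError {
    InvalidBytesLength(InvalidBytesLengthError),
}

impl From<InvalidBytesLengthError> for CodecError {
    fn from(err: InvalidBytesLengthError) -> Self {
        Self::InvalidBytesLength(err)
    }
}

// `size_of` is in the prelude from Rust 1.80, which is consistent with
// this diff dropping the `std::mem::size_of` imports.
fn read_index_len(bytes: &[u8]) -> Result<u64, CodecError> {
    if bytes.len() < size_of::<u64>() {
        return Err(InvalidBytesLengthError::new(bytes.len(), size_of::<u64>()).into());
    }
    Ok(u64::from_le_bytes(bytes[..size_of::<u64>()].try_into().unwrap()))
}

fn main() {
    assert!(read_index_len(&[0; 4]).is_err()); // too short for a u64 header
    assert_eq!(read_index_len(&8u64.to_le_bytes()).unwrap(), 8);
}
```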
pub(crate) const IDENTIFIER: &str = "vlen_v2"; // pub use vlen_v2::IDENTIFIER; -use crate::array::{codec::CodecError, RawBytes}; +use crate::array::{ + codec::{CodecError, InvalidBytesLengthError}, + RawBytes, +}; pub(crate) use vlen_v2_codec::VlenV2Codec; @@ -67,10 +70,7 @@ fn get_interleaved_bytes_and_offsets( // Validate the bytes is long enough to contain header and element lengths let header_length = size_of::() * (1 + num_elements); if bytes.len() < header_length { - return Err(CodecError::UnexpectedChunkDecodedSize( - bytes.len(), - header_length as u64, - )); + return Err(InvalidBytesLengthError::new(bytes.len(), header_length).into()); } // Validate the number of elements from the header diff --git a/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_codec.rs b/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_codec.rs index dd781f67..e0bcbef3 100644 --- a/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_codec.rs +++ b/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_codec.rs @@ -1,4 +1,4 @@ -use std::{mem::size_of, sync::Arc}; +use std::sync::Arc; use itertools::Itertools; @@ -7,11 +7,11 @@ use crate::{ codec::{ ArrayCodecTraits, ArrayPartialDecoderTraits, ArrayPartialEncoderDefault, ArrayPartialEncoderTraits, ArrayToBytesCodecTraits, BytesPartialDecoderTraits, - BytesPartialEncoderTraits, CodecError, CodecOptions, CodecTraits, + BytesPartialEncoderTraits, CodecError, CodecMetadataOptions, CodecOptions, CodecTraits, RecommendedConcurrency, }, - ArrayBytes, ArrayMetadataOptions, BytesRepresentation, ChunkRepresentation, DataTypeSize, - RawBytes, + ArrayBytes, BytesRepresentation, ChunkRepresentation, DataTypeSize, RawBytes, + RawBytesOffsets, }, config::global_config, metadata::v3::MetadataV3, @@ -35,7 +35,7 @@ impl VlenV2Codec { } impl CodecTraits for VlenV2Codec { - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { let config = global_config(); let name = config .experimental_codec_names() @@ -111,7 +111,9 @@ impl ArrayToBytesCodecTraits for VlenV2Codec { ) -> Result, CodecError> { let num_elements = decoded_representation.num_elements_usize(); let (bytes, offsets) = super::get_interleaved_bytes_and_offsets(num_elements, &bytes)?; - Ok(ArrayBytes::new_vlen(bytes, offsets)) + let offsets = RawBytesOffsets::new(offsets)?; + let array_bytes = ArrayBytes::new_vlen(bytes, offsets)?; + Ok(array_bytes) } fn partial_decoder( diff --git a/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_macros.rs b/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_macros.rs index 0ea12587..8a5f35b2 100644 --- a/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_macros.rs +++ b/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_macros.rs @@ -45,23 +45,24 @@ macro_rules! 
vlen_v2_codec { codec::{ array_to_bytes::vlen_v2::VlenV2Codec, ArrayPartialDecoderTraits, ArrayPartialEncoderTraits, ArrayToBytesCodecTraits, BytesPartialDecoderTraits, - BytesPartialEncoderTraits, CodecError, CodecOptions, CodecTraits, + BytesPartialEncoderTraits, CodecError, CodecMetadataOptions, CodecOptions, + CodecTraits, }, - ArrayBytes, ArrayCodecTraits, ArrayMetadataOptions, BytesRepresentation, - ChunkRepresentation, RawBytes, RecommendedConcurrency, + ArrayBytes, ArrayCodecTraits, BytesRepresentation, ChunkRepresentation, RawBytes, + RecommendedConcurrency, }; #[cfg(feature = "async")] use crate::array::codec::{AsyncArrayPartialDecoderTraits, AsyncBytesPartialDecoderTraits}; - /// The `$identifier` codec implementation. + #[doc = concat!("The `", $identifier, "` codec implementation.")] #[derive(Debug, Clone)] pub struct $struct { inner: Arc, } impl $struct { - /// Create a new `$identifier` codec. + #[doc = concat!("Create a new `", $identifier, "` codec.")] #[must_use] pub fn new() -> Self { Self { @@ -77,7 +78,7 @@ macro_rules! vlen_v2_codec { } impl CodecTraits for $struct { - fn create_metadata_opt(&self, options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, options: &CodecMetadataOptions) -> Option { self.inner.create_metadata_opt(options) } diff --git a/zarrs/src/array/codec/array_to_bytes/zfp.rs b/zarrs/src/array/codec/array_to_bytes/zfp.rs index 8c48c08d..5ccdf3be 100644 --- a/zarrs/src/array/codec/array_to_bytes/zfp.rs +++ b/zarrs/src/array/codec/array_to_bytes/zfp.rs @@ -554,7 +554,7 @@ mod tests { .map(|bytes| bytes.into_fixed().unwrap().to_vec()) .flatten() .collect::>() - .chunks(std::mem::size_of::()) + .chunks(size_of::()) .map(|b| f32::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![ @@ -614,7 +614,7 @@ mod tests { .map(|bytes| bytes.into_fixed().unwrap().to_vec()) .flatten() .collect::>() - .chunks(std::mem::size_of::()) + .chunks(size_of::()) .map(|b| f32::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![ diff --git a/zarrs/src/array/codec/array_to_bytes/zfp/zfp_codec.rs b/zarrs/src/array/codec/array_to_bytes/zfp/zfp_codec.rs index 8a7a1406..81294871 100644 --- a/zarrs/src/array/codec/array_to_bytes/zfp/zfp_codec.rs +++ b/zarrs/src/array/codec/array_to_bytes/zfp/zfp_codec.rs @@ -15,10 +15,10 @@ use crate::{ codec::{ ArrayBytes, ArrayCodecTraits, ArrayPartialDecoderTraits, ArrayPartialEncoderDefault, ArrayPartialEncoderTraits, ArrayToBytesCodecTraits, BytesPartialDecoderTraits, - BytesPartialEncoderTraits, CodecError, CodecOptions, CodecTraits, RawBytes, - RecommendedConcurrency, + BytesPartialEncoderTraits, CodecError, CodecMetadataOptions, CodecOptions, CodecTraits, + RawBytes, RecommendedConcurrency, }, - ArrayMetadataOptions, BytesRepresentation, ChunkRepresentation, DataType, + BytesRepresentation, ChunkRepresentation, DataType, }, config::global_config, metadata::v3::{array::codec::zfp::ZfpMode, MetadataV3}, @@ -129,7 +129,7 @@ impl ZfpCodec { } impl CodecTraits for ZfpCodec { - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { let configuration = ZfpCodecConfigurationV1 { write_header: Some(self.write_header), mode: self.mode, diff --git a/zarrs/src/array/codec/bytes_to_bytes/blosc.rs b/zarrs/src/array/codec/bytes_to_bytes/blosc.rs index 9ad54c9f..b8a00612 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/blosc.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/blosc.rs @@ -127,11 +127,7 @@ 
fn blosc_compress_bytes( fn blosc_validate(src: &[u8]) -> Option { let mut destsize: usize = 0; let valid = unsafe { - blosc_cbuffer_validate( - src.as_ptr().cast::(), - src.len(), - std::ptr::addr_of_mut!(destsize), - ) + blosc_cbuffer_validate(src.as_ptr().cast::(), src.len(), &raw mut destsize) } == 0; valid.then_some(destsize) } @@ -145,8 +141,8 @@ fn blosc_typesize(src: &[u8]) -> Option { unsafe { blosc_cbuffer_metainfo( src.as_ptr().cast::(), - std::ptr::addr_of_mut!(typesize), - std::ptr::addr_of_mut!(flags), + &raw mut typesize, + &raw mut flags, ); }; (typesize != 0).then_some(typesize) @@ -164,9 +160,9 @@ fn blosc_nbytes(src: &[u8]) -> Option { unsafe { blosc_cbuffer_sizes( src.as_ptr().cast::(), - std::ptr::addr_of_mut!(uncompressed_bytes), - std::ptr::addr_of_mut!(cbytes), - std::ptr::addr_of_mut!(blocksize), + &raw mut uncompressed_bytes, + &raw mut cbytes, + &raw mut blocksize, ); }; (uncompressed_bytes > 0 && cbytes > 0 && blocksize > 0).then_some(uncompressed_bytes) @@ -379,7 +375,7 @@ mod tests { let decoded: Vec = decoded .to_vec() - .chunks_exact(std::mem::size_of::()) + .chunks_exact(size_of::()) .map(|b| u16::from_ne_bytes(b.try_into().unwrap())) .collect(); @@ -428,7 +424,7 @@ mod tests { let decoded: Vec = decoded .to_vec() - .chunks_exact(std::mem::size_of::()) + .chunks_exact(size_of::()) .map(|b| u16::from_ne_bytes(b.try_into().unwrap())) .collect(); diff --git a/zarrs/src/array/codec/bytes_to_bytes/blosc/blosc_codec.rs b/zarrs/src/array/codec/bytes_to_bytes/blosc/blosc_codec.rs index 62972cd9..afd1bdb0 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/blosc/blosc_codec.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/blosc/blosc_codec.rs @@ -6,9 +6,10 @@ use crate::{ array::{ codec::{ BytesPartialDecoderTraits, BytesPartialEncoderDefault, BytesPartialEncoderTraits, - BytesToBytesCodecTraits, CodecError, CodecOptions, CodecTraits, RecommendedConcurrency, + BytesToBytesCodecTraits, CodecError, CodecMetadataOptions, CodecOptions, CodecTraits, + RecommendedConcurrency, }, - ArrayMetadataOptions, BytesRepresentation, RawBytes, + BytesRepresentation, RawBytes, }, metadata::v3::MetadataV3, plugin::PluginCreateError, @@ -131,7 +132,7 @@ impl BloscCodec { } impl CodecTraits for BloscCodec { - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { let configuration = BloscCodecConfigurationV1 { cname: self.cname, clevel: self.clevel, diff --git a/zarrs/src/array/codec/bytes_to_bytes/bz2.rs b/zarrs/src/array/codec/bytes_to_bytes/bz2.rs index 58eff05a..6a182f49 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/bz2.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/bz2.rs @@ -126,7 +126,7 @@ mod tests { let decoded: Vec = decoded .to_vec() - .chunks_exact(std::mem::size_of::()) + .chunks_exact(size_of::()) .map(|b| u16::from_ne_bytes(b.try_into().unwrap())) .collect(); @@ -174,7 +174,7 @@ mod tests { let decoded: Vec = decoded .to_vec() - .chunks_exact(std::mem::size_of::()) + .chunks_exact(size_of::()) .map(|b| u16::from_ne_bytes(b.try_into().unwrap())) .collect(); diff --git a/zarrs/src/array/codec/bytes_to_bytes/bz2/bz2_codec.rs b/zarrs/src/array/codec/bytes_to_bytes/bz2/bz2_codec.rs index d1380c3e..e967e681 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/bz2/bz2_codec.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/bz2/bz2_codec.rs @@ -8,9 +8,10 @@ use crate::{ array::{ codec::{ BytesPartialDecoderTraits, BytesPartialEncoderDefault, BytesPartialEncoderTraits, - 
BytesToBytesCodecTraits, CodecError, CodecOptions, CodecTraits, RecommendedConcurrency,
+            BytesToBytesCodecTraits, CodecError, CodecMetadataOptions, CodecOptions, CodecTraits,
+            RecommendedConcurrency,
         },
-        ArrayMetadataOptions, BytesRepresentation, RawBytes,
+        BytesRepresentation, RawBytes,
     },
     config::global_config,
     metadata::v3::MetadataV3,
@@ -46,7 +47,7 @@ impl Bz2Codec {
 }
 
 impl CodecTraits for Bz2Codec {
-    fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option<MetadataV3> {
+    fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option<MetadataV3> {
         let configuration = Bz2CodecConfigurationV1 {
             level: Bz2CompressionLevel::try_from(self.compression.level())
                 .expect("checked on init"),
diff --git a/zarrs/src/array/codec/bytes_to_bytes/crc32c.rs b/zarrs/src/array/codec/bytes_to_bytes/crc32c.rs
index e95ff365..7f8667cd 100644
--- a/zarrs/src/array/codec/bytes_to_bytes/crc32c.rs
+++ b/zarrs/src/array/codec/bytes_to_bytes/crc32c.rs
@@ -38,7 +38,7 @@ pub(crate) fn create_codec_crc32c(metadata: &MetadataV3) -> Result<Codec, PluginCreateError> {
 
-const CHECKSUM_SIZE: usize = core::mem::size_of::<u32>();
+const CHECKSUM_SIZE: usize = size_of::<u32>();
 
 #[cfg(test)]
 mod tests {
@@ -89,8 +89,7 @@ mod tests {
         assert_eq!(bytes, decoded.to_vec());
 
         // Check that the checksum is correct
-        let checksum: &[u8; 4] = &encoded
-            [encoded.len() - core::mem::size_of::<u32>()..encoded.len()]
+        let checksum: &[u8; 4] = &encoded[encoded.len() - size_of::<u32>()..encoded.len()]
             .try_into()
             .unwrap();
         println!("checksum {checksum:?}");
diff --git a/zarrs/src/array/codec/bytes_to_bytes/crc32c/crc32c_codec.rs b/zarrs/src/array/codec/bytes_to_bytes/crc32c/crc32c_codec.rs
index 7e88534b..47d1cfc6 100644
--- a/zarrs/src/array/codec/bytes_to_bytes/crc32c/crc32c_codec.rs
+++ b/zarrs/src/array/codec/bytes_to_bytes/crc32c/crc32c_codec.rs
@@ -5,9 +5,10 @@ use crate::{
         codec::{
             bytes_to_bytes::strip_suffix_partial_decoder::StripSuffixPartialDecoder,
             BytesPartialDecoderTraits, BytesPartialEncoderDefault, BytesPartialEncoderTraits,
-            BytesToBytesCodecTraits, CodecError, CodecOptions, CodecTraits, RecommendedConcurrency,
+            BytesToBytesCodecTraits, CodecError, CodecMetadataOptions, CodecOptions, CodecTraits,
+            RecommendedConcurrency,
         },
-        ArrayMetadataOptions, BytesRepresentation, RawBytes,
+        BytesRepresentation, RawBytes,
     },
     metadata::v3::MetadataV3,
 };
@@ -39,7 +40,7 @@ impl Crc32cCodec {
 }
 
 impl CodecTraits for Crc32cCodec {
-    fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option<MetadataV3> {
+    fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option<MetadataV3> {
         let configuration = Crc32cCodecConfigurationV1 {};
         Some(MetadataV3::new_with_serializable_configuration(IDENTIFIER, &configuration).unwrap())
     }
diff --git a/zarrs/src/array/codec/bytes_to_bytes/fletcher32.rs b/zarrs/src/array/codec/bytes_to_bytes/fletcher32.rs
index 74411496..159ed82b 100644
--- a/zarrs/src/array/codec/bytes_to_bytes/fletcher32.rs
+++ b/zarrs/src/array/codec/bytes_to_bytes/fletcher32.rs
@@ -53,7 +53,7 @@ pub(crate) fn create_codec_fletcher32(metadata: &MetadataV3) -> Result<Codec, PluginCreateError> {
 
-const CHECKSUM_SIZE: usize = core::mem::size_of::<u32>();
+const CHECKSUM_SIZE: usize = size_of::<u32>();
 
 #[cfg(test)]
 mod tests {
@@ -106,8 +106,7 @@ mod tests {
         assert_eq!(bytes, decoded.to_vec());
 
         // Check that the checksum is correct
-        let checksum: &[u8; 4] = &encoded
-            [encoded.len() - core::mem::size_of::<u32>()..encoded.len()]
+        let checksum: &[u8; 4] = &encoded[encoded.len() - size_of::<u32>()..encoded.len()]
             .try_into()
             .unwrap();
         println!("checksum {checksum:?}");
diff --git a/zarrs/src/array/codec/bytes_to_bytes/fletcher32/fletcher32_codec.rs b/zarrs/src/array/codec/bytes_to_bytes/fletcher32/fletcher32_codec.rs
index
04e319f6..0944ea56 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/fletcher32/fletcher32_codec.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/fletcher32/fletcher32_codec.rs @@ -7,9 +7,10 @@ use crate::{ codec::{ bytes_to_bytes::strip_suffix_partial_decoder::StripSuffixPartialDecoder, BytesPartialDecoderTraits, BytesPartialEncoderDefault, BytesPartialEncoderTraits, - BytesToBytesCodecTraits, CodecError, CodecOptions, CodecTraits, RecommendedConcurrency, + BytesToBytesCodecTraits, CodecError, CodecMetadataOptions, CodecOptions, CodecTraits, + RecommendedConcurrency, }, - ArrayMetadataOptions, BytesRepresentation, RawBytes, + BytesRepresentation, RawBytes, }, metadata::v3::MetadataV3, }; @@ -43,7 +44,7 @@ impl Fletcher32Codec { } impl CodecTraits for Fletcher32Codec { - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { let configuration = Fletcher32CodecConfigurationV1 {}; Some(MetadataV3::new_with_serializable_configuration(IDENTIFIER, &configuration).unwrap()) } diff --git a/zarrs/src/array/codec/bytes_to_bytes/gdeflate.rs b/zarrs/src/array/codec/bytes_to_bytes/gdeflate.rs index 9a14cf0c..13ade3f2 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/gdeflate.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/gdeflate.rs @@ -28,7 +28,7 @@ pub use gdeflate_codec::GDeflateCodec; use crate::{ array::{ - codec::{Codec, CodecError, CodecPlugin}, + codec::{Codec, CodecError, CodecPlugin, InvalidBytesLengthError}, RawBytes, }, metadata::v3::{array::codec::gdeflate, MetadataV3}, @@ -37,7 +37,6 @@ use crate::{ pub use gdeflate::IDENTIFIER; -use core::mem::size_of; use std::sync::Arc; // Register the codec. @@ -62,10 +61,11 @@ const GDEFLATE_STATIC_HEADER_LENGTH: usize = 2 * size_of::(); fn gdeflate_decode(encoded_value: &RawBytes<'_>) -> Result, CodecError> { if encoded_value.len() < GDEFLATE_STATIC_HEADER_LENGTH { - return Err(CodecError::UnexpectedChunkDecodedSize( + return Err(InvalidBytesLengthError::new( encoded_value.len(), - GDEFLATE_STATIC_HEADER_LENGTH as u64, - )); + GDEFLATE_STATIC_HEADER_LENGTH, + ) + .into()); } // Decode the static header @@ -78,10 +78,11 @@ fn gdeflate_decode(encoded_value: &RawBytes<'_>) -> Result, CodecError> // Check length of dynamic header let dynamic_header_length = num_pages * size_of::(); if encoded_value.len() < GDEFLATE_STATIC_HEADER_LENGTH + dynamic_header_length { - return Err(CodecError::UnexpectedChunkDecodedSize( + return Err(InvalidBytesLengthError::new( encoded_value.len(), - (GDEFLATE_STATIC_HEADER_LENGTH + dynamic_header_length) as u64, - )); + GDEFLATE_STATIC_HEADER_LENGTH + dynamic_header_length, + ) + .into()); } // Decode the pages @@ -329,7 +330,7 @@ mod tests { let decoded_partial_chunk: Vec = decoded_partial_chunk .to_vec() - .chunks_exact(std::mem::size_of::()) + .chunks_exact(size_of::()) .map(|b| u16::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![2, 3, 5]; @@ -372,7 +373,7 @@ mod tests { let decoded_partial_chunk: Vec = decoded_partial_chunk .to_vec() - .chunks_exact(std::mem::size_of::()) + .chunks_exact(size_of::()) .map(|b| u16::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![2, 3, 5]; diff --git a/zarrs/src/array/codec/bytes_to_bytes/gdeflate/gdeflate_codec.rs b/zarrs/src/array/codec/bytes_to_bytes/gdeflate/gdeflate_codec.rs index a26dc588..8d969abd 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/gdeflate/gdeflate_codec.rs +++ 
b/zarrs/src/array/codec/bytes_to_bytes/gdeflate/gdeflate_codec.rs @@ -1,13 +1,12 @@ -use core::mem::size_of; use std::{borrow::Cow, sync::Arc}; use crate::{ array::{ codec::{ BytesPartialDecoderTraits, BytesPartialEncoderDefault, BytesPartialEncoderTraits, - BytesToBytesCodecTraits, CodecError, CodecOptions, CodecTraits, + BytesToBytesCodecTraits, CodecError, CodecMetadataOptions, CodecOptions, CodecTraits, }, - ArrayMetadataOptions, BytesRepresentation, RawBytes, RecommendedConcurrency, + BytesRepresentation, RawBytes, RecommendedConcurrency, }, metadata::v3::MetadataV3, }; @@ -48,7 +47,7 @@ impl GDeflateCodec { } impl CodecTraits for GDeflateCodec { - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { let configuration = GDeflateCodecConfigurationV1 { level: self.compression_level, }; diff --git a/zarrs/src/array/codec/bytes_to_bytes/gzip.rs b/zarrs/src/array/codec/bytes_to_bytes/gzip.rs index f9dab455..7be5438e 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/gzip.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/gzip.rs @@ -129,7 +129,7 @@ mod tests { let decoded_partial_chunk: Vec = decoded_partial_chunk .to_vec() - .chunks_exact(std::mem::size_of::()) + .chunks_exact(size_of::()) .map(|b| u16::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![2, 3, 5]; @@ -171,7 +171,7 @@ mod tests { let decoded_partial_chunk: Vec = decoded_partial_chunk .to_vec() - .chunks_exact(std::mem::size_of::()) + .chunks_exact(size_of::()) .map(|b| u16::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![2, 3, 5]; diff --git a/zarrs/src/array/codec/bytes_to_bytes/gzip/gzip_codec.rs b/zarrs/src/array/codec/bytes_to_bytes/gzip/gzip_codec.rs index af57ada4..aa249c68 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/gzip/gzip_codec.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/gzip/gzip_codec.rs @@ -10,9 +10,10 @@ use crate::{ array::{ codec::{ BytesPartialDecoderTraits, BytesPartialEncoderDefault, BytesPartialEncoderTraits, - BytesToBytesCodecTraits, CodecError, CodecOptions, CodecTraits, RecommendedConcurrency, + BytesToBytesCodecTraits, CodecError, CodecMetadataOptions, CodecOptions, CodecTraits, + RecommendedConcurrency, }, - ArrayMetadataOptions, BytesRepresentation, RawBytes, + BytesRepresentation, RawBytes, }, metadata::v3::MetadataV3, }; @@ -52,7 +53,7 @@ impl GzipCodec { } impl CodecTraits for GzipCodec { - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { let configuration = GzipCodecConfigurationV1 { level: self.compression_level, }; diff --git a/zarrs/src/array/codec/bytes_to_bytes/test_unbounded.rs b/zarrs/src/array/codec/bytes_to_bytes/test_unbounded.rs index 8336d9ba..90b379df 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/test_unbounded.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/test_unbounded.rs @@ -69,7 +69,7 @@ mod tests { let decoded_partial_chunk: Vec = decoded_partial_chunk .to_vec() - .chunks_exact(std::mem::size_of::()) + .chunks_exact(size_of::()) .map(|b| u16::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![2, 3, 5]; @@ -112,7 +112,7 @@ mod tests { let decoded_partial_chunk: Vec = decoded_partial_chunk .to_vec() - .chunks_exact(std::mem::size_of::()) + .chunks_exact(size_of::()) .map(|b| u16::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![2, 3, 5]; diff --git 
a/zarrs/src/array/codec/bytes_to_bytes/test_unbounded/test_unbounded_codec.rs b/zarrs/src/array/codec/bytes_to_bytes/test_unbounded/test_unbounded_codec.rs index ee419d23..4166a6ca 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/test_unbounded/test_unbounded_codec.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/test_unbounded/test_unbounded_codec.rs @@ -4,9 +4,10 @@ use crate::{ array::{ codec::{ BytesPartialDecoderTraits, BytesPartialEncoderDefault, BytesPartialEncoderTraits, - BytesToBytesCodecTraits, CodecError, CodecOptions, CodecTraits, RecommendedConcurrency, + BytesToBytesCodecTraits, CodecError, CodecMetadataOptions, CodecOptions, CodecTraits, + RecommendedConcurrency, }, - ArrayMetadataOptions, BytesRepresentation, RawBytes, + BytesRepresentation, RawBytes, }, metadata::v3::MetadataV3, }; @@ -31,7 +32,7 @@ impl TestUnboundedCodec { } impl CodecTraits for TestUnboundedCodec { - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { None } diff --git a/zarrs/src/array/codec/bytes_to_bytes/zstd.rs b/zarrs/src/array/codec/bytes_to_bytes/zstd.rs index 5e4ed1b5..88192699 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/zstd.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/zstd.rs @@ -114,7 +114,7 @@ mod tests { let decoded_partial_chunk: Vec = decoded_partial_chunk .to_vec() - .chunks_exact(std::mem::size_of::()) + .chunks_exact(size_of::()) .map(|b| u16::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![2, 3, 5]; @@ -157,7 +157,7 @@ mod tests { let decoded_partial_chunk: Vec = decoded_partial_chunk .to_vec() - .chunks_exact(std::mem::size_of::()) + .chunks_exact(size_of::()) .map(|b| u16::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![2, 3, 5]; diff --git a/zarrs/src/array/codec/bytes_to_bytes/zstd/zstd_codec.rs b/zarrs/src/array/codec/bytes_to_bytes/zstd/zstd_codec.rs index 185b060f..03138c63 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/zstd/zstd_codec.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/zstd/zstd_codec.rs @@ -6,9 +6,10 @@ use crate::{ array::{ codec::{ BytesPartialDecoderTraits, BytesPartialEncoderDefault, BytesPartialEncoderTraits, - BytesToBytesCodecTraits, CodecError, CodecOptions, CodecTraits, RecommendedConcurrency, + BytesToBytesCodecTraits, CodecError, CodecMetadataOptions, CodecOptions, CodecTraits, + RecommendedConcurrency, }, - ArrayMetadataOptions, BytesRepresentation, RawBytes, + BytesRepresentation, RawBytes, }, metadata::v3::MetadataV3, }; @@ -47,7 +48,7 @@ impl ZstdCodec { } impl CodecTraits for ZstdCodec { - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { let configuration = ZstdCodecConfigurationV1 { level: self.compression.into(), checksum: self.checksum, diff --git a/zarrs/src/array/codec/metadata_options.rs b/zarrs/src/array/codec/metadata_options.rs new file mode 100644 index 00000000..7a0a94c2 --- /dev/null +++ b/zarrs/src/array/codec/metadata_options.rs @@ -0,0 +1,39 @@ +//! Codec metadata options. + +/// Options for codec metadata. 
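The new `metadata_options` module that opens here pairs a `#[must_use]` owning builder (`with_*`) with an in-place mutator (`set_*`) for each option. A hedged sketch of the two setter styles with an abbreviated field name; the struct is a stand-in, not `CodecMetadataOptions` itself:

```rust
#[derive(Debug, Clone, Default)]
struct Options {
    store_metadata_if_encode_only: bool,
}

impl Options {
    // Owning builder style: chainable from `Options::default()`.
    #[must_use]
    fn with_store_metadata_if_encode_only(mut self, enabled: bool) -> Self {
        self.store_metadata_if_encode_only = enabled;
        self
    }

    // In-place style: mutates an existing value, returns `&mut Self` for chaining.
    fn set_store_metadata_if_encode_only(&mut self, enabled: bool) -> &mut Self {
        self.store_metadata_if_encode_only = enabled;
        self
    }
}

fn main() {
    let owned = Options::default().with_store_metadata_if_encode_only(true);
    let mut in_place = Options::default();
    in_place.set_store_metadata_if_encode_only(true);
    assert_eq!(
        owned.store_metadata_if_encode_only,
        in_place.store_metadata_if_encode_only
    );
}
```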
+#[derive(Debug, Clone, Default)] +pub struct CodecMetadataOptions { + experimental_codec_store_metadata_if_encode_only: bool, +} + +// impl Default for CodecMetadataOptions { +// fn default() -> Self { +// Self { +// experimental_codec_store_metadata_if_encode_only: false, +// } +// } +// } + +impl CodecMetadataOptions { + /// Return the [experimental codec store metadata if encode only](crate::config::Config#experimental-codec-store-metadata-if-encode-only) setting. + #[must_use] + pub fn experimental_codec_store_metadata_if_encode_only(&self) -> bool { + self.experimental_codec_store_metadata_if_encode_only + } + + /// Set the [experimental codec store metadata if encode only](crate::config::Config#experimental-codec-store-metadata-if-encode-only) setting. + #[must_use] + pub fn with_experimental_codec_store_metadata_if_encode_only(mut self, enabled: bool) -> Self { + self.experimental_codec_store_metadata_if_encode_only = enabled; + self + } + + /// Set the [experimental codec store metadata if encode only](crate::config::Config#experimental-codec-store-metadata-if-encode-only) setting. + pub fn set_experimental_codec_store_metadata_if_encode_only( + &mut self, + enabled: bool, + ) -> &mut Self { + self.experimental_codec_store_metadata_if_encode_only = enabled; + self + } +} diff --git a/zarrs/src/array/element.rs b/zarrs/src/array/element.rs index c28c8612..aa826358 100644 --- a/zarrs/src/array/element.rs +++ b/zarrs/src/array/element.rs @@ -3,7 +3,9 @@ use std::mem::ManuallyDrop; use itertools::Itertools; use ArrayError::IncompatibleElementType as IET; -use super::{convert_from_bytes_slice, transmute_to_bytes, ArrayBytes, ArrayError, DataType}; +use super::{ + convert_from_bytes_slice, transmute_to_bytes, ArrayBytes, ArrayError, DataType, RawBytesOffsets, +}; /// A trait representing an array element type. pub trait Element: Sized + Clone { @@ -184,13 +186,21 @@ macro_rules! impl_element_string { len = len.checked_add(element.len()).unwrap(); } offsets.push(len); + let offsets = unsafe { + // SAFETY: The offsets are monotonically increasing. + RawBytesOffsets::new_unchecked(offsets) + }; // Concatenate bytes let mut bytes = Vec::with_capacity(usize::try_from(len).unwrap()); for element in elements { bytes.extend_from_slice(element.as_bytes()); } - Ok(ArrayBytes::new_vlen(bytes, offsets)) + let array_bytes = unsafe { + // SAFETY: The last offset is the length of the bytes. + ArrayBytes::new_vlen_unchecked(bytes, offsets) + }; + Ok(array_bytes) } } }; @@ -238,11 +248,19 @@ macro_rules! impl_element_binary { len = len.checked_add(element.len()).unwrap(); } offsets.push(len); + let offsets = unsafe { + // SAFETY: The offsets are monotonically increasing. + RawBytesOffsets::new_unchecked(offsets) + }; // Concatenate bytes let bytes = elements.concat(); - Ok(ArrayBytes::new_vlen(bytes, offsets)) + let array_bytes = unsafe { + // SAFETY: The last offset is the length of the bytes. + ArrayBytes::new_vlen_unchecked(bytes, offsets) + }; + Ok(array_bytes) } } }; diff --git a/zarrs/src/array_subset.rs b/zarrs/src/array_subset.rs index 947c0f99..67c34146 100644 --- a/zarrs/src/array_subset.rs +++ b/zarrs/src/array_subset.rs @@ -565,9 +565,26 @@ impl ArraySubset { } } - /// Returns true if the array subset is within the bounds of `array_shape`. + /// Returns true if this array subset is within the bounds of `subset`. 
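The `SAFETY` comments in the `element.rs` hunks above lean on two invariants that the checked constructors (`RawBytesOffsets::new`, `ArrayBytes::new_vlen`) are described as validating: offsets must be monotonically increasing, and the last offset must equal the byte length. A std-only sketch of those checks with illustrative types:

```rust
fn validate_vlen(bytes: &[u8], offsets: &[usize]) -> Result<(), String> {
    // Offsets must be monotonically increasing.
    if offsets.windows(2).any(|pair| pair[0] > pair[1]) {
        return Err("offsets are not monotonically increasing".to_string());
    }
    // The last offset must equal the length of the bytes.
    match offsets.last() {
        Some(&last) if last == bytes.len() => Ok(()),
        _ => Err("the last offset must equal the byte length".to_string()),
    }
}

fn main() {
    assert!(validate_vlen(b"abb", &[0, 1, 3]).is_ok()); // elements "a", "bb"
    assert!(validate_vlen(b"abb", &[0, 2, 1]).is_err()); // not monotonic
    assert!(validate_vlen(b"abb", &[0, 1]).is_err()); // does not cover "bb"
}
```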
#[must_use] - pub fn inbounds(&self, array_shape: &[u64]) -> bool { + pub fn inbounds(&self, subset: &ArraySubset) -> bool { + if self.dimensionality() != subset.dimensionality() { + return false; + } + + for (self_start, self_shape, other_start, other_shape) in + izip!(self.start(), self.shape(), subset.start(), subset.shape()) + { + if self_start < other_start || self_start + self_shape > other_start + other_shape { + return false; + } + } + true + } + + /// Returns true if the array subset is within the bounds of an `ArraySubset` with zero origin and a shape of `array_shape`. + #[must_use] + pub fn inbounds_shape(&self, array_shape: &[u64]) -> bool { if self.dimensionality() != array_shape.len() { return false; } @@ -634,9 +651,14 @@ mod tests { ArraySubset::new_with_ranges(&[0..4, 1..5]) ); assert!(array_subset0.relative_to(&[1, 1, 1]).is_err()); - assert!(array_subset0.inbounds(&[10, 10])); - assert!(!array_subset0.inbounds(&[2, 2])); - assert!(!array_subset0.inbounds(&[10, 10, 10])); + assert!(array_subset0.inbounds_shape(&[10, 10])); + assert!(!array_subset0.inbounds_shape(&[2, 2])); + assert!(!array_subset0.inbounds_shape(&[10, 10, 10])); + assert!(array_subset0.inbounds(&ArraySubset::new_with_ranges(&[0..6, 1..7]))); + assert!(array_subset0.inbounds(&ArraySubset::new_with_ranges(&[1..5, 2..6]))); + assert!(!array_subset0.inbounds(&ArraySubset::new_with_ranges(&[2..5, 2..6]))); + assert!(!array_subset0.inbounds(&ArraySubset::new_with_ranges(&[1..5, 2..5]))); + assert!(!array_subset0.inbounds(&ArraySubset::new_with_ranges(&[2..5]))); assert_eq!(array_subset0.to_ranges(), vec![1..5, 2..6]); let array_subset2 = ArraySubset::new_with_ranges(&[3..6, 4..7, 0..1]); diff --git a/zarrs/src/config.rs b/zarrs/src/config.rs index ec0a985f..f89410fb 100644 --- a/zarrs/src/config.rs +++ b/zarrs/src/config.rs @@ -5,7 +5,7 @@ use crate::metadata::v3::array::codec; use std::{ collections::HashMap, - sync::{OnceLock, RwLock, RwLockReadGuard, RwLockWriteGuard}, + sync::{LazyLock, RwLock, RwLockReadGuard, RwLockWriteGuard}, }; #[cfg(doc)] @@ -54,6 +54,14 @@ use crate::array::{codec::CodecOptions, ArrayMetadataOptions}; /// This option sets the preferred minimum chunk concurrency. /// The concurrency of internal codecs is adjusted to accomodate for the chunk concurrency in accordance with the concurrent target set in the [`CodecOptions`] parameter of an encode or decode method. /// +/// ### Experimental Partial Encoding +/// > default: [`false`] +/// +/// If `true`, [`Array::store_chunk_subset`](crate::array::Array::store_chunk_subset) and [`Array::store_array_subset`](crate::array::Array::store_array_subset) and variants can use partial encoding. +/// This is relevant when using the sharding codec, as it enables inner chunks to be written without reading and writing entire shards. +/// +/// This is an experimental feature for now until it has more comprehensively tested and support is added in the async API. +/// /// ## Metadata Options /// /// ### Experimental Codec Store Metadata If Encode Only @@ -96,14 +104,6 @@ use crate::array::{codec::CodecOptions, ArrayMetadataOptions}; /// /// Sets the names used when serialising and deserialising the names of experimental codecs. /// Deserialisation also accepts the standard `IDENTIFIER` of the codec. 
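The reworked `ArraySubset::inbounds` above reduces to per-dimension interval containment. A std-only sketch over `(start, shape)` pairs, which stand in for `ArraySubset` here; the assertions mirror the test cases in the hunk:

```rust
fn inbounds(a_start: &[u64], a_shape: &[u64], b_start: &[u64], b_shape: &[u64]) -> bool {
    // Dimensionality must match, then each interval of `a` must sit
    // inside the corresponding interval of `b`.
    a_start.len() == b_start.len()
        && a_start
            .iter()
            .zip(a_shape)
            .zip(b_start.iter().zip(b_shape))
            .all(|((&start, &shape), (&b_start, &b_shape))| {
                start >= b_start && start + shape <= b_start + b_shape
            })
}

fn main() {
    // 1..5 x 2..6 is inside 0..6 x 1..7 ...
    assert!(inbounds(&[1, 2], &[4, 4], &[0, 1], &[6, 6]));
    // ... but not inside 2..5 x 2..6 (the first dimension starts too late).
    assert!(!inbounds(&[1, 2], &[4, 4], &[2, 2], &[3, 4]));
}
```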
-///
-/// ### Experimental Partial Encoding
-/// > default: [`false`]
-///
-/// If `true`, [`Array::store_chunk_subset`](crate::array::Array::store_chunk_subset) and [`Array::store_array_subset`](crate::array::Array::store_array_subset) and variants can use partial encoding.
-/// This is relevant when using the sharding codec, as it enables inner chunks to be written without reading and writing entire shards.
-///
-/// This is an experimental feature for now until it has more comprehensively tested and support is added in the async API.
 #[derive(Debug)]
 #[allow(clippy::struct_excessive_bools)]
 pub struct Config {
@@ -282,17 +282,14 @@ impl Config {
     }
 }
 
-static CONFIG: OnceLock<RwLock<Config>> = OnceLock::new();
+static CONFIG: LazyLock<RwLock<Config>> = LazyLock::new(|| RwLock::new(Config::default()));
 
 /// Returns a reference to the global `zarrs` configuration.
 ///
 /// # Panics
 /// This function panics if the underlying lock has been poisoned and might panic if the global config is already held by the current thread.
 pub fn global_config() -> RwLockReadGuard<'static, Config> {
-    CONFIG
-        .get_or_init(|| RwLock::new(Config::default()))
-        .read()
-        .unwrap()
+    CONFIG.read().unwrap()
 }
 
 /// Returns a mutable reference to the global `zarrs` configuration.
 ///
 /// # Panics
 /// This function panics if the underlying lock has been poisoned and might panic if the global config is already held by the current thread.
 pub fn global_config_mut() -> RwLockWriteGuard<'static, Config> {
-    CONFIG
-        .get_or_init(|| RwLock::new(Config::default()))
-        .write()
-        .unwrap()
+    CONFIG.write().unwrap()
 }
 
 /// The metadata version to retrieve.
diff --git a/zarrs/src/group.rs b/zarrs/src/group.rs
index 54d04380..f3e8faf0 100644
--- a/zarrs/src/group.rs
+++ b/zarrs/src/group.rs
@@ -754,8 +754,8 @@ mod tests {
     }
 
     #[test]
-    fn group_metadata_invalid_additional_field() {
-        let group_metadata = serde_json::from_str::<GroupMetadata>(
+    fn group_metadata_unknown_additional_field() {
+        let group_metadata = serde_json::from_str::<GroupMetadataV3>(
             r#"{
                 "zarr_format": 3,
                 "node_type": "group",
                 "attributes": {
                     "spam": "ham",
                     "eggs": 42
                 },
-                "unknown": "fail"
+                "unknown": "unsupported"
             }"#,
-        );
-        assert!(group_metadata.is_err());
+        )
+        .unwrap();
+        assert!(group_metadata.additional_fields.len() == 1);
+        assert!(group_metadata
+            .additional_fields
+            .get("unknown")
+            .unwrap()
+            .must_understand());
     }
 
     #[test]
diff --git a/zarrs/src/group/group_builder.rs b/zarrs/src/group/group_builder.rs
index 36d13abd..8b0206c8 100644
--- a/zarrs/src/group/group_builder.rs
+++ b/zarrs/src/group/group_builder.rs
@@ -44,8 +44,7 @@ impl GroupBuilder {
     /// Set additional fields not defined in the Zarr specification.
     /// Use this cautiously. In general, store user defined attributes using [`GroupBuilder::attributes`].
     ///
-    /// Note that array metadata must not contain any additional fields, unless they are annotated with `"must_understand": false`.
-    /// `zarrs` will error when opening an array with additional fields without this annotation.
+    /// `zarrs` and other implementations are expected to error when opening a group with unsupported additional fields, unless they are a JSON object containing `"must_understand": false`.
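A hedged sketch of the additional-fields rule restated above: an unknown field must be understood unless it is a JSON object carrying `"must_understand": false`. It uses `serde_json` for brevity and is not the `AdditionalFields` implementation:

```rust
use serde_json::{json, Value};

// A field defaults to "must understand" unless it explicitly opts out.
fn must_understand(field: &Value) -> bool {
    field.get("must_understand") != Some(&Value::Bool(false))
}

fn main() {
    // A bare value, like `"unknown": "unsupported"` in the test above,
    // must be understood, so an implementation is expected to reject it.
    assert!(must_understand(&json!("unsupported")));
    // An object that opts out is ignorable.
    assert!(!must_understand(&json!({ "must_understand": false, "x": 1 })));
}
```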
pub fn additional_fields(&mut self, additional_fields: AdditionalFields) -> &mut Self { match &mut self.metadata { GroupMetadata::V3(metadata) => metadata.additional_fields = additional_fields, diff --git a/zarrs/src/lib.rs b/zarrs/src/lib.rs index af5e04de..2a47456f 100644 --- a/zarrs/src/lib.rs +++ b/zarrs/src/lib.rs @@ -10,27 +10,22 @@ //! //! ## Getting Started //! - Review the [implementation status](#implementation-status), [array support](#array-support), and [storage support](#storage-support). -//! - Read [The `zarrs` Book](https://book.zarrs.dev). +//! - Read [The `zarrs` Book]. //! - View the [examples](https://github.com/LDeakin/zarrs/tree/main/zarrs/examples) and [the example below](#examples). //! - Read the [documentation](https://docs.rs/zarrs/latest/zarrs/). [`array::Array`] is a good place to start. //! - Check out the [`zarrs` ecosystem](#zarrs-ecosystem). //! //! ## Implementation Status //! -#![doc = include_str!("../doc/status/ZEPs.md")] +//! #### Zarr Version Support //! //! `zarrs` has first-class Zarr V3 support and additionally supports a *compatible subset* of Zarr V2 data that: //! - can be converted to V3 with only a metadata change, and //! - uses array metadata that is recognised and supported for encoding/decoding. //! -//! An existing V2 or V3 array can be opened with [`Array::open`](crate::array::Array::open). -//! A new array can be created from V2 or V3 metadata with [`Array::new_with_metadata`](crate::array::Array::new_with_metadata). -//! The [`ArrayBuilder`](crate::array::ArrayBuilder) only supports V3 array creation. +//! `zarrs` supports forward conversion from Zarr V2 to V3. See ["Converting Zarr V2 to V3"](https://book.zarrs.dev/v2_to_v3.html) in [The `zarrs` Book], or try the [`zarrs_reencode`](https://github.com/LDeakin/zarrs_tools/blob/main/docs/zarrs_reencode.md) CLI tool. //! -//! `zarrs` supports forward conversion of Zarr V2 data to V3. -//! See ["Metadata Convert Version"](crate::config::Config#metadata-convert-version) and ["Metadata Erase Version"](crate::config::Config#metadata-erase-version) for information about manipulating the version of array/group metadata. -//! -//! ### Array Support +//! #### Array Support //! //!
Data Types //! @@ -62,7 +57,7 @@ #![doc = include_str!("../doc/status/storage_transformers.md")] //!
//! -//! ### Storage Support +//! #### Storage Support //! //! `zarrs` supports stores (filesystem, HTTP, S3, etc.) via crates implementing the [`zarrs_storage`] API. //! @@ -182,6 +177,8 @@ //! - the MIT license [LICENSE-MIT](https://docs.rs/crate/zarrs/latest/source/LICENCE-MIT) or , at your option. //! //! Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any additional terms or conditions. +//! +//! [The `zarrs` Book]: https://book.zarrs.dev #![cfg_attr(docsrs, feature(doc_auto_cfg))] pub mod array; @@ -193,6 +190,7 @@ pub mod node; pub mod plugin; pub mod version; +pub use zarrs_data_type as data_type; pub use zarrs_metadata as metadata; pub use zarrs_storage as storage; @@ -205,7 +203,7 @@ pub use storage::byte_range; /// Get a mutable slice of the spare capacity in a vector. fn vec_spare_capacity_to_mut_slice(vec: &mut Vec) -> &mut [T] { let spare_capacity = vec.spare_capacity_mut(); - // SAFETY: `spare_capacity` is valid for both reads and writes for len * mem::size_of::() many bytes, and it is properly aligned + // SAFETY: `spare_capacity` is valid for both reads and writes for len * size_of::() many bytes, and it is properly aligned unsafe { std::slice::from_raw_parts_mut( spare_capacity.as_mut_ptr().cast::(), diff --git a/zarrs/src/node.rs b/zarrs/src/node.rs index 66ee83c6..505c062e 100644 --- a/zarrs/src/node.rs +++ b/zarrs/src/node.rs @@ -322,7 +322,12 @@ impl Node { /// Returns the name of the node. #[must_use] pub fn name(&self) -> NodeName { - let name = self.path.as_str().split('/').last().unwrap_or_default(); + let name = self + .path + .as_str() + .split('/') + .next_back() + .unwrap_or_default(); unsafe { NodeName::new_unchecked(name) } } diff --git a/zarrs/src/plugin.rs b/zarrs/src/plugin.rs index 47486045..ddcc00fe 100644 --- a/zarrs/src/plugin.rs +++ b/zarrs/src/plugin.rs @@ -3,7 +3,8 @@ //! A [`Plugin`] creates objects from [`MetadataV3`] (consisting of a name and optional configuration). //! It is used to implement [Zarr extension points](https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#extension-points), such as [chunk grids][`crate::array::chunk_grid`], [chunk key encodings](`crate::array::chunk_key_encoding`), [codecs](`crate::array::codec`), and [storage transformers](`crate::array::storage_transformer`). //! -//! [Data types](`crate::array::data_type`) are not currently supported as an extension point. +//! [`DataType`](crate::data_type::DataType)s are not currently supported as an extension point. +// FIXME: Data type extensions //! //! Plugins are registered at compile time using the [inventory] crate. //! At runtime, a name matching function is applied to identify which registered plugin is associated with the metadata. 
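The `plugin` docs above describe compile-time registration via `inventory` followed by runtime name matching. A std-only sketch of that lookup, using a plain slice and string stand-ins in place of the real registry and codec types:

```rust
// Each registered plugin supplies a match function and a constructor.
struct Plugin {
    matches: fn(name: &str) -> bool,
    create: fn() -> &'static str,
}

// In zarrs the registry is populated at compile time by `inventory`;
// a static slice is enough to show the matching step.
static PLUGINS: &[Plugin] = &[
    Plugin { matches: |name| name == "gzip", create: || "GzipCodec" },
    Plugin { matches: |name| name == "crc32c", create: || "Crc32cCodec" },
];

fn create_codec(name: &str) -> Option<&'static str> {
    PLUGINS
        .iter()
        .find(|plugin| (plugin.matches)(name))
        .map(|plugin| (plugin.create)())
}

fn main() {
    assert_eq!(create_codec("gzip"), Some("GzipCodec"));
    assert_eq!(create_codec("vlen-utf8"), None); // unmatched -> plugin error
}
```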
diff --git a/zarrs/tests/array_partial_encode.rs b/zarrs/tests/array_partial_encode.rs index 3fb4768d..6af530cc 100644 --- a/zarrs/tests/array_partial_encode.rs +++ b/zarrs/tests/array_partial_encode.rs @@ -3,7 +3,6 @@ use std::sync::Arc; -use core::mem::size_of; use zarrs::{ array::{ codec::{ diff --git a/zarrs/tests/data/v2/array_blosc_F.zarr/0.0 b/zarrs/tests/data/v2/array_blosc_F.zarr/0.0 index 404ac43c..f4101146 100644 Binary files a/zarrs/tests/data/v2/array_blosc_F.zarr/0.0 and b/zarrs/tests/data/v2/array_blosc_F.zarr/0.0 differ diff --git a/zarrs/tests/data/v2/array_blosc_F.zarr/0.1 b/zarrs/tests/data/v2/array_blosc_F.zarr/0.1 index 91cfc088..4343539f 100644 Binary files a/zarrs/tests/data/v2/array_blosc_F.zarr/0.1 and b/zarrs/tests/data/v2/array_blosc_F.zarr/0.1 differ diff --git a/zarrs/tests/data/v2/array_blosc_F.zarr/1.0 b/zarrs/tests/data/v2/array_blosc_F.zarr/1.0 index 8f42c6fe..823b486a 100644 Binary files a/zarrs/tests/data/v2/array_blosc_F.zarr/1.0 and b/zarrs/tests/data/v2/array_blosc_F.zarr/1.0 differ diff --git a/zarrs/tests/data/v2/array_blosc_F.zarr/1.1 b/zarrs/tests/data/v2/array_blosc_F.zarr/1.1 index 55c1b71f..26bceb7a 100644 Binary files a/zarrs/tests/data/v2/array_blosc_F.zarr/1.1 and b/zarrs/tests/data/v2/array_blosc_F.zarr/1.1 differ diff --git a/zarrs/tests/data/v2/array_gzip_C.zarr/0.0 b/zarrs/tests/data/v2/array_gzip_C.zarr/0.0 index f012f727..3feffc8b 100644 Binary files a/zarrs/tests/data/v2/array_gzip_C.zarr/0.0 and b/zarrs/tests/data/v2/array_gzip_C.zarr/0.0 differ diff --git a/zarrs/tests/data/v2/array_gzip_C.zarr/0.1 b/zarrs/tests/data/v2/array_gzip_C.zarr/0.1 index cf03fcce..5d052ed1 100644 Binary files a/zarrs/tests/data/v2/array_gzip_C.zarr/0.1 and b/zarrs/tests/data/v2/array_gzip_C.zarr/0.1 differ diff --git a/zarrs/tests/data/v2/array_gzip_C.zarr/1.0 b/zarrs/tests/data/v2/array_gzip_C.zarr/1.0 index 67e321df..49c02be3 100644 Binary files a/zarrs/tests/data/v2/array_gzip_C.zarr/1.0 and b/zarrs/tests/data/v2/array_gzip_C.zarr/1.0 differ diff --git a/zarrs/tests/data/v2/array_gzip_C.zarr/1.1 b/zarrs/tests/data/v2/array_gzip_C.zarr/1.1 index b80ca555..e1b9a62f 100644 Binary files a/zarrs/tests/data/v2/array_gzip_C.zarr/1.1 and b/zarrs/tests/data/v2/array_gzip_C.zarr/1.1 differ diff --git a/zarrs/tests/data/v2/array_none_F.zarr/0.0 b/zarrs/tests/data/v2/array_none_F.zarr/0.0 index f9dc734c..c1252537 100644 Binary files a/zarrs/tests/data/v2/array_none_F.zarr/0.0 and b/zarrs/tests/data/v2/array_none_F.zarr/0.0 differ diff --git a/zarrs/tests/data/v2/array_none_F.zarr/0.1 b/zarrs/tests/data/v2/array_none_F.zarr/0.1 index 2bcffade..2bc66476 100644 Binary files a/zarrs/tests/data/v2/array_none_F.zarr/0.1 and b/zarrs/tests/data/v2/array_none_F.zarr/0.1 differ diff --git a/zarrs/tests/data/v2/array_none_F.zarr/1.0 b/zarrs/tests/data/v2/array_none_F.zarr/1.0 index b15fa039..638a99a9 100644 Binary files a/zarrs/tests/data/v2/array_none_F.zarr/1.0 and b/zarrs/tests/data/v2/array_none_F.zarr/1.0 differ diff --git a/zarrs/tests/data/v2/array_none_F.zarr/1.1 b/zarrs/tests/data/v2/array_none_F.zarr/1.1 index 3bcfed80..7df5783a 100644 Binary files a/zarrs/tests/data/v2/array_none_F.zarr/1.1 and b/zarrs/tests/data/v2/array_none_F.zarr/1.1 differ diff --git a/zarrs/tests/data/v2_cities.py b/zarrs/tests/data/v2_cities.py index b75a2d80..1b330581 100644 --- a/zarrs/tests/data/v2_cities.py +++ b/zarrs/tests/data/v2_cities.py @@ -1,12 +1,29 @@ +#!/usr/bin/env -S uv run +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "zarr==3.0.1", +# "pandas==2.2.3" +# 
] +# /// + import zarr import pandas as pd -print(zarr.__version__) # This was generated with zarr==2.18 - df = pd.read_csv("tests/data/cities.csv", header=None) cities = df[0] -path_out = 'tests/data/zarr_python_compat/cities_v2.zarr' -array = zarr.open(path_out, mode='w', dtype=str, shape=(len(cities),), chunks=(1000,), compressor = None, fill_value='') +path_out = "tests/data/zarr_python_compat/cities_v2.zarr" +array = zarr.create_array( + path_out, + dtype=str, + shape=(len(cities),), + chunks=(1000,), + filters=zarr.codecs.vlen_utf8.VLenUTF8(), + compressors=[None], + # fill_value="", + zarr_format=2, + overwrite=True, +) array[:] = cities.values print(array.info) diff --git a/zarrs/tests/data/v2_generate.py b/zarrs/tests/data/v2_generate.py old mode 100644 new mode 100755 index caad33e6..480c1867 --- a/zarrs/tests/data/v2_generate.py +++ b/zarrs/tests/data/v2_generate.py @@ -1,3 +1,14 @@ +#!/usr/bin/env -S uv run +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "zarr==3.0.1", +# "numcodecs==0.15.0", +# "zfpy==1.0.1", +# "pcodec==0.3.2", +# ] +# /// + import zarr import numpy as np from numcodecs import Blosc, GZip, BZ2, ZFPY, PCodec, Zstd diff --git a/zarrs/tests/data/v2_str0.py b/zarrs/tests/data/v2_str0.py new file mode 100755 index 00000000..162724ee --- /dev/null +++ b/zarrs/tests/data/v2_str0.py @@ -0,0 +1,42 @@ +#!/usr/bin/env -S uv run +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "zarr==3.0.1", +# ] +# /// + +import zarr + +path_out = "tests/data/zarr_python_compat/str_v2_fv_0.zarr" +array = zarr.create_array( + path_out, + dtype=str, + shape=(5,), + chunks=(2,), + filters=zarr.codecs.vlen_utf8.VLenUTF8(), + compressors=[None], + fill_value=0, + zarr_format=2, + overwrite=True, +) +array[:3] = ["a", "bb", ""] +print(array.info) +# assert (array[:] == ["a", "bb", "", "", ""]).all() # FAILURE + +path_out = "tests/data/zarr_python_compat/str_v2_fv_null.zarr" +array = zarr.create_array( + path_out, + dtype=str, + shape=(5,), + chunks=(2,), + filters=zarr.codecs.vlen_utf8.VLenUTF8(), + compressors=[None], + fill_value=None, + zarr_format=2, + overwrite=True, +) +array[:3] = ["a", "bb", ""] +print(array.info) +print(array[:]) +assert (array[:] == ["a", "bb", "", "", ""]).all() \ No newline at end of file diff --git a/zarrs/tests/data/v3/array_blosc_transpose.zarr/.zarray b/zarrs/tests/data/v3/array_blosc_transpose.zarr/.zarray index d5bdb34a..8bfa6f0e 100644 --- a/zarrs/tests/data/v3/array_blosc_transpose.zarr/.zarray +++ b/zarrs/tests/data/v3/array_blosc_transpose.zarr/.zarray @@ -12,10 +12,10 @@ "dtype": " 0.15.0 +# "zfpy==1.0.1", +# "pcodec==0.3.2", +# ] +# /// + +import zarr import numpy as np -from numcodecs.zarr3 import BZ2, ZFPY, PCodec # 0.14.2.dev22 with https://github.com/zarr-developers/numcodecs/pull/685 +from numcodecs.zarr3 import BZ2, ZFPY, PCodec compressor_blosc = zarr.codecs.BloscCodec(cname="zstd", clevel=1, shuffle=zarr.codecs.BloscShuffle.bitshuffle) compressor_gzip = zarr.codecs.GzipCodec(level=9) diff --git a/zarrs/tests/data/v3_zarr_python/array_gzip.zarr/c/0/0 b/zarrs/tests/data/v3_zarr_python/array_gzip.zarr/c/0/0 index 5a9f93cb..ed309704 100644 Binary files a/zarrs/tests/data/v3_zarr_python/array_gzip.zarr/c/0/0 and b/zarrs/tests/data/v3_zarr_python/array_gzip.zarr/c/0/0 differ diff --git a/zarrs/tests/data/v3_zarr_python/array_gzip.zarr/c/0/1 b/zarrs/tests/data/v3_zarr_python/array_gzip.zarr/c/0/1 index 64597638..49038743 100644 Binary files a/zarrs/tests/data/v3_zarr_python/array_gzip.zarr/c/0/1 and 
b/zarrs/tests/data/v3_zarr_python/array_gzip.zarr/c/0/1 differ diff --git a/zarrs/tests/data/v3_zarr_python/array_gzip.zarr/c/1/0 b/zarrs/tests/data/v3_zarr_python/array_gzip.zarr/c/1/0 index 4543b4ee..c5790a72 100644 Binary files a/zarrs/tests/data/v3_zarr_python/array_gzip.zarr/c/1/0 and b/zarrs/tests/data/v3_zarr_python/array_gzip.zarr/c/1/0 differ diff --git a/zarrs/tests/data/v3_zarr_python/array_gzip.zarr/c/1/1 b/zarrs/tests/data/v3_zarr_python/array_gzip.zarr/c/1/1 index a99f64d9..cebc0fb0 100644 Binary files a/zarrs/tests/data/v3_zarr_python/array_gzip.zarr/c/1/1 and b/zarrs/tests/data/v3_zarr_python/array_gzip.zarr/c/1/1 differ diff --git a/zarrs/tests/data/zarr_python_compat/cities_v2.zarr/.zarray b/zarrs/tests/data/zarr_python_compat/cities_v2.zarr/.zarray index f459a409..1337f1cb 100644 --- a/zarrs/tests/data/zarr_python_compat/cities_v2.zarr/.zarray +++ b/zarrs/tests/data/zarr_python_compat/cities_v2.zarr/.zarray @@ -1,18 +1,19 @@ { - "chunks": [ - 1000 - ], - "compressor": null, - "dtype": "|O", - "fill_value": "", - "filters": [ - { - "id": "vlen-utf8" - } - ], - "order": "C", - "shape": [ - 47868 - ], - "zarr_format": 2 + "shape": [ + 47868 + ], + "chunks": [ + 1000 + ], + "fill_value": null, + "order": "C", + "filters": [ + { + "id": "vlen-utf8" + } + ], + "dimension_separator": ".", + "compressor": null, + "zarr_format": 2, + "dtype": "|O" } \ No newline at end of file diff --git a/zarrs/tests/data/zarr_python_compat/cities_v2.zarr/.zattrs b/zarrs/tests/data/zarr_python_compat/cities_v2.zarr/.zattrs new file mode 100644 index 00000000..9e26dfee --- /dev/null +++ b/zarrs/tests/data/zarr_python_compat/cities_v2.zarr/.zattrs @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/zarrs/tests/data/zarr_python_compat/cities_v3.zarr/c/47 b/zarrs/tests/data/zarr_python_compat/cities_v3.zarr/c/47 index 4ee64533..09682c9c 100644 Binary files a/zarrs/tests/data/zarr_python_compat/cities_v3.zarr/c/47 and b/zarrs/tests/data/zarr_python_compat/cities_v3.zarr/c/47 differ diff --git a/zarrs/tests/data/zarr_python_compat/cities_v3.zarr/zarr.json b/zarrs/tests/data/zarr_python_compat/cities_v3.zarr/zarr.json index 30245721..27279d75 100644 --- a/zarrs/tests/data/zarr_python_compat/cities_v3.zarr/zarr.json +++ b/zarrs/tests/data/zarr_python_compat/cities_v3.zarr/zarr.json @@ -1 +1,31 @@ -{"shape": [47868], "data_type": "string", "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": [1000]}}, "chunk_key_encoding": {"name": "default", "configuration": {"separator": "/"}}, "fill_value": "0", "codecs": [{"name": "vlen-utf8", "configuration": {}}], "attributes": {}, "zarr_format": 3, "node_type": "array", "storage_transformers": []} \ No newline at end of file +{ + "shape": [ + 47868 + ], + "data_type": "string", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [ + 1000 + ] + } + }, + "chunk_key_encoding": { + "name": "default", + "configuration": { + "separator": "/" + } + }, + "fill_value": "", + "codecs": [ + { + "name": "vlen-utf8", + "configuration": {} + } + ], + "attributes": {}, + "zarr_format": 3, + "node_type": "array", + "storage_transformers": [] +} \ No newline at end of file diff --git a/zarrs/tests/data/zarr_python_compat/str_v2_fv_0.zarr/.zarray b/zarrs/tests/data/zarr_python_compat/str_v2_fv_0.zarr/.zarray new file mode 100644 index 00000000..aae1beed --- /dev/null +++ b/zarrs/tests/data/zarr_python_compat/str_v2_fv_0.zarr/.zarray @@ -0,0 +1,19 @@ +{ + "shape": [ + 5 + ], + "chunks": [ + 2 + ], + "fill_value": 0, + "order": "C", + 
"filters": [ + { + "id": "vlen-utf8" + } + ], + "dimension_separator": ".", + "compressor": null, + "zarr_format": 2, + "dtype": "|O" +} \ No newline at end of file diff --git a/zarrs/tests/data/zarr_python_compat/str_v2_fv_0.zarr/.zattrs b/zarrs/tests/data/zarr_python_compat/str_v2_fv_0.zarr/.zattrs new file mode 100644 index 00000000..9e26dfee --- /dev/null +++ b/zarrs/tests/data/zarr_python_compat/str_v2_fv_0.zarr/.zattrs @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/zarrs/tests/data/zarr_python_compat/str_v2_fv_0.zarr/0 b/zarrs/tests/data/zarr_python_compat/str_v2_fv_0.zarr/0 new file mode 100644 index 00000000..72190f21 Binary files /dev/null and b/zarrs/tests/data/zarr_python_compat/str_v2_fv_0.zarr/0 differ diff --git a/zarrs/tests/data/zarr_python_compat/str_v2_fv_0.zarr/1 b/zarrs/tests/data/zarr_python_compat/str_v2_fv_0.zarr/1 new file mode 100644 index 00000000..3ae16874 Binary files /dev/null and b/zarrs/tests/data/zarr_python_compat/str_v2_fv_0.zarr/1 differ diff --git a/zarrs/tests/data/zarr_python_compat/str_v2_fv_null.zarr/.zarray b/zarrs/tests/data/zarr_python_compat/str_v2_fv_null.zarr/.zarray new file mode 100644 index 00000000..a1b39c04 --- /dev/null +++ b/zarrs/tests/data/zarr_python_compat/str_v2_fv_null.zarr/.zarray @@ -0,0 +1,19 @@ +{ + "shape": [ + 5 + ], + "chunks": [ + 2 + ], + "fill_value": null, + "order": "C", + "filters": [ + { + "id": "vlen-utf8" + } + ], + "dimension_separator": ".", + "compressor": null, + "zarr_format": 2, + "dtype": "|O" +} \ No newline at end of file diff --git a/zarrs/tests/data/zarr_python_compat/str_v2_fv_null.zarr/.zattrs b/zarrs/tests/data/zarr_python_compat/str_v2_fv_null.zarr/.zattrs new file mode 100644 index 00000000..9e26dfee --- /dev/null +++ b/zarrs/tests/data/zarr_python_compat/str_v2_fv_null.zarr/.zattrs @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/zarrs/tests/data/zarr_python_compat/str_v2_fv_null.zarr/0 b/zarrs/tests/data/zarr_python_compat/str_v2_fv_null.zarr/0 new file mode 100644 index 00000000..72190f21 Binary files /dev/null and b/zarrs/tests/data/zarr_python_compat/str_v2_fv_null.zarr/0 differ diff --git a/zarrs/tests/data/zarr_python_compat/str_v2_fv_null.zarr/1 b/zarrs/tests/data/zarr_python_compat/str_v2_fv_null.zarr/1 new file mode 100644 index 00000000..3ae16874 Binary files /dev/null and b/zarrs/tests/data/zarr_python_compat/str_v2_fv_null.zarr/1 differ diff --git a/zarrs/tests/zarr_python_compat.rs b/zarrs/tests/zarr_python_compat.rs index 54b4d195..7f1974ca 100644 --- a/zarrs/tests/zarr_python_compat.rs +++ b/zarrs/tests/zarr_python_compat.rs @@ -43,3 +43,31 @@ fn zarr_python_compat_fletcher32_v2() -> Result<(), Box> { Ok(()) } + +#[test] +fn zarr_python_v2_compat_str_fv_0() -> Result<(), Box> { + let store = Arc::new(FilesystemStore::new( + "tests/data/zarr_python_compat/str_v2_fv_0.zarr", + )?); + let array = zarrs::array::Array::open(store.clone(), "/")?; + let subset_all = array.subset_all(); + let elements = array.retrieve_array_subset_elements::(&subset_all)?; + + assert_eq!(elements, &["a", "bb", "", "", ""]); + + Ok(()) +} + +#[test] +fn zarr_python_v2_compat_str_fv_null() -> Result<(), Box> { + let store = Arc::new(FilesystemStore::new( + "tests/data/zarr_python_compat/str_v2_fv_null.zarr", + )?); + let array = zarrs::array::Array::open(store.clone(), "/")?; + let subset_all = array.subset_all(); + let elements = array.retrieve_array_subset_elements::(&subset_all)?; + + assert_eq!(elements, &["a", "bb", "", "", ""]); + + Ok(()) +} diff --git a/zarrs_data_type/CHANGELOG.md 
diff --git a/zarrs_data_type/CHANGELOG.md b/zarrs_data_type/CHANGELOG.md
new file mode 100644
index 00000000..91bd6bbf
--- /dev/null
+++ b/zarrs_data_type/CHANGELOG.md
@@ -0,0 +1,17 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [Unreleased]
+
+## [0.1.0] - 2025-01-24
+
+### Added
+- Initial release
+- Split from the `zarrs::array::{data_type,fill_value}` modules of `zarrs` 0.20.0-dev
+
+[unreleased]: https://github.com/LDeakin/zarrs/compare/zarrs_data_type-v0.1.0...HEAD
+[0.1.0]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_data_type-v0.1.0
diff --git a/zarrs_data_type/Cargo.toml b/zarrs_data_type/Cargo.toml
new file mode 100644
index 00000000..cc028d01
--- /dev/null
+++ b/zarrs_data_type/Cargo.toml
@@ -0,0 +1,30 @@
+[package]
+name = "zarrs_data_type"
+version = "0.1.0"
+authors = ["Lachlan Deakin "]
+edition = "2021"
+rust-version = "1.77"
+description = "Zarr data types for the zarrs crate"
+homepage = "https://zarrs.dev"
+documentation = "https://docs.rs/zarrs_data_type"
+repository = "https://github.com/LDeakin/zarrs/tree/main/zarrs_data_type"
+license = "MIT OR Apache-2.0"
+keywords = ["zarr", "zarrs"]
+categories = ["encoding"]
+
+[lints]
+workspace = true
+
+[package.metadata.docs.rs]
+all-features = true
+
+[dependencies]
+half = { workspace = true }
+num = { workspace = true }
+thiserror = "2.0.0"
+derive_more = { version = "1.0.0", features = ["display", "from"] }
+zarrs_metadata = { workspace = true }
+
+[dev-dependencies]
+serde_json = { version = "1.0.71", features = ["float_roundtrip", "preserve_order"] }
+bytemuck = { version = "1.14.0", features = ["extern_crate_alloc", "must_cast", "min_const_generics"] }
diff --git a/zarrs_data_type/LICENCE-APACHE b/zarrs_data_type/LICENCE-APACHE
new file mode 120000
index 00000000..536a3dbc
--- /dev/null
+++ b/zarrs_data_type/LICENCE-APACHE
@@ -0,0 +1 @@
+../LICENCE-APACHE
\ No newline at end of file
diff --git a/zarrs_data_type/LICENCE-MIT b/zarrs_data_type/LICENCE-MIT
new file mode 120000
index 00000000..e259b4c0
--- /dev/null
+++ b/zarrs_data_type/LICENCE-MIT
@@ -0,0 +1 @@
+../LICENCE-MIT
\ No newline at end of file
diff --git a/zarrs_data_type/README.md b/zarrs_data_type/README.md
new file mode 100644
index 00000000..abe03ad1
--- /dev/null
+++ b/zarrs_data_type/README.md
@@ -0,0 +1,15 @@
+# zarrs_data_type
+
+[![Latest Version](https://img.shields.io/crates/v/zarrs_data_type.svg)](https://crates.io/crates/zarrs_data_type)
+[![zarrs_data_type documentation](https://docs.rs/zarrs_data_type/badge.svg)](https://docs.rs/zarrs_data_type)
+![msrv](https://img.shields.io/crates/msrv/zarrs_data_type)
+[![build](https://github.com/LDeakin/zarrs/actions/workflows/ci.yml/badge.svg)](https://github.com/LDeakin/zarrs/actions/workflows/ci.yml)
+
+[Zarr](https://zarr-specs.readthedocs.io/) data types for the [`zarrs`](https://crates.io/crates/zarrs) Rust crate.
+
+## Licence
+`zarrs_data_type` is licensed under either of
+ - the Apache License, Version 2.0 [LICENSE-APACHE](./LICENCE-APACHE) or <http://www.apache.org/licenses/LICENSE-2.0> or
+ - the MIT license [LICENSE-MIT](./LICENCE-MIT) or <http://opensource.org/licenses/MIT>, at your option.
+
+Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any additional terms or conditions.
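(Aside, not part of the patch: the new crate re-exports only the handful of items shown in `src/lib.rs` below. A minimal usage sketch, assuming the `FillValue::as_ne_bytes()` accessor is unchanged from `zarrs::array` and that the `num` dependency is in scope:)

```rust
use zarrs_data_type::{DataType, FillValue};

fn main() {
    // Fill values are raw bytes in native endianness.
    let fill_value = FillValue::from(f32::NAN);
    assert_eq!(fill_value.as_ne_bytes().len(), 4);

    // The complex From impls in fill_value.rs below concatenate the
    // native-endian bytes of the real and imaginary parts.
    let complex = FillValue::from(num::complex::Complex32::new(1.0, 2.0));
    assert_eq!(complex.as_ne_bytes().len(), 8);

    // Data types are a plain enum; String and Bytes are marked experimental.
    let _data_type = DataType::Float32;
}
```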
diff --git a/zarrs/src/array/data_type.rs b/zarrs_data_type/src/data_type.rs
similarity index 99%
rename from zarrs/src/array/data_type.rs
rename to zarrs_data_type/src/data_type.rs
index 478a8907..1d0d64da 100644
--- a/zarrs/src/array/data_type.rs
+++ b/zarrs_data_type/src/data_type.rs
@@ -6,7 +6,7 @@ use derive_more::From;
 use half::{bf16, f16};
 use thiserror::Error;
 
-use crate::metadata::v3::array::{
+use zarrs_metadata::v3::array::{
     data_type::{DataTypeMetadataV3, DataTypeSize},
     fill_value::{
         bfloat16_to_fill_value, float16_to_fill_value, float32_to_fill_value,
@@ -53,9 +53,13 @@ pub enum DataType {
     Complex128,
     /// `r*` raw bits, variable size given by *, limited to be a multiple of 8.
     RawBits(usize), // the stored usize is the size in bytes
-    /// A UTF-8 encoded string.
+    /// A UTF-8 encoded string. **Experimental**.
+    ///
+    /// This data type is not standardised in the Zarr V3 specification.
     String,
-    /// Variable-sized binary data.
+    /// Variable-sized binary data. **Experimental**.
+    ///
+    /// This data type is not standardised in the Zarr V3 specification.
     Bytes,
 }
 
@@ -352,7 +356,7 @@ impl core::fmt::Display for DataType {
 mod tests {
     use super::*;
 
-    use crate::metadata::v3::array::{
+    use zarrs_metadata::v3::array::{
         fill_value::{FillValueFloatStringNonFinite, HexString},
         nan_representations::{ZARR_NAN_BF16, ZARR_NAN_F16, ZARR_NAN_F32, ZARR_NAN_F64},
     };
diff --git a/zarrs/src/array/fill_value.rs b/zarrs_data_type/src/fill_value.rs
similarity index 94%
rename from zarrs/src/array/fill_value.rs
rename to zarrs_data_type/src/fill_value.rs
index f885d8ce..fa4609a9 100644
--- a/zarrs/src/array/fill_value.rs
+++ b/zarrs_data_type/src/fill_value.rs
@@ -118,7 +118,7 @@
 impl From<num::complex::Complex32> for FillValue {
     fn from(value: num::complex::Complex32) -> Self {
-        let mut bytes = Vec::with_capacity(std::mem::size_of::<num::complex::Complex32>());
+        let mut bytes = Vec::with_capacity(size_of::<num::complex::Complex32>());
         bytes.extend(value.re.to_ne_bytes());
         bytes.extend(value.im.to_ne_bytes());
         Self(bytes)
@@ -127,7 +127,7 @@ impl From<num::complex::Complex32> for FillValue {
 impl From<num::complex::Complex64> for FillValue {
     fn from(value: num::complex::Complex64) -> Self {
-        let mut bytes = Vec::with_capacity(std::mem::size_of::<num::complex::Complex64>());
+        let mut bytes = Vec::with_capacity(size_of::<num::complex::Complex64>());
         bytes.extend(value.re.to_ne_bytes());
         bytes.extend(value.im.to_ne_bytes());
         Self(bytes)
@@ -233,10 +233,21 @@ impl FillValue {
 
 #[cfg(test)]
 mod tests {
-    use crate::array::transmute_to_bytes_vec;
-
     use super::*;
 
+    /// Convert from `&[T]` to `Vec<u8>`.
+    #[must_use]
+    fn convert_to_bytes_vec<T: bytemuck::NoUninit>(from: &[T]) -> Vec<u8> {
+        bytemuck::allocation::pod_collect_to_vec(from)
+    }
+
+    /// Transmute from `Vec<T>` to `Vec<u8>`.
+    #[must_use]
+    fn transmute_to_bytes_vec<T: bytemuck::NoUninit>(from: Vec<T>) -> Vec<u8> {
+        bytemuck::allocation::try_cast_vec(from)
+            .unwrap_or_else(|(_err, from)| convert_to_bytes_vec(&from))
+    }
+
     #[test]
     fn fill_value() {
         assert_eq!(
diff --git a/zarrs_data_type/src/lib.rs b/zarrs_data_type/src/lib.rs
new file mode 100644
index 00000000..b9a4eeaf
--- /dev/null
+++ b/zarrs_data_type/src/lib.rs
@@ -0,0 +1,10 @@
+//! [Zarr](https://zarr-specs.readthedocs.io/) data types for the [`zarrs`](https://docs.rs/zarrs/latest/zarrs/index.html) crate.
+ +mod data_type; +mod fill_value; + +pub use data_type::{ + DataType, IncompatibleFillValueError, IncompatibleFillValueMetadataError, + UnsupportedDataTypeError, +}; +pub use fill_value::FillValue; diff --git a/zarrs_filesystem/Cargo.toml b/zarrs_filesystem/Cargo.toml index a26b2cc1..5dd882d5 100644 --- a/zarrs_filesystem/Cargo.toml +++ b/zarrs_filesystem/Cargo.toml @@ -7,7 +7,7 @@ rust-version = "1.77" description = "A filesystem store for the zarrs crate" homepage = "https://zarrs.dev" documentation = "https://docs.rs/zarrs_filesystem" -repository = "https://github.com/LDeakin/zarrs" +repository = "https://github.com/LDeakin/zarrs/tree/main/zarrs_filesystem" license = "MIT OR Apache-2.0" keywords = ["zarr", "zarrs", "storage", "store", "filesystem"] categories = ["encoding"] @@ -21,7 +21,7 @@ derive_more = { version = "1.0.0", features = ["from"] } itertools = "0.14.0" libc = "0.2.158" page_size = "0.6.0" -parking_lot = "0.12.0" +parking_lot = "0.12.0" # TODO: Remove with Rust 1.78+ pathdiff = "0.2.0" thiserror = "2.0.0" walkdir = "2.3.2" diff --git a/zarrs_filesystem/src/lib.rs b/zarrs_filesystem/src/lib.rs index 103756c6..18ecd261 100644 --- a/zarrs_filesystem/src/lib.rs +++ b/zarrs_filesystem/src/lib.rs @@ -15,7 +15,7 @@ use zarrs_storage::{ }; use bytes::BytesMut; -use parking_lot::RwLock; +use parking_lot::RwLock; // TODO: std::sync::RwLock with Rust 1.78+ use thiserror::Error; use walkdir::WalkDir; diff --git a/zarrs_http/CHANGELOG.md b/zarrs_http/CHANGELOG.md index 848702e8..3a6b3984 100644 --- a/zarrs_http/CHANGELOG.md +++ b/zarrs_http/CHANGELOG.md @@ -10,6 +10,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Bump `itertools` to 0.14 +### Fixed +- Fixed crate manifest `documentation` and `keywords` + ## [0.2.0] - 2024-11-15 ### Changed diff --git a/zarrs_http/Cargo.toml b/zarrs_http/Cargo.toml index 68dd11c5..70c22c93 100644 --- a/zarrs_http/Cargo.toml +++ b/zarrs_http/Cargo.toml @@ -6,10 +6,10 @@ edition = "2021" rust-version = "1.77" description = "A synchronous http store for the zarrs crate" homepage = "https://zarrs.dev" -documentation = "https://docs.rs/zarrs_storage" -repository = "https://github.com/LDeakin/zarrs" +documentation = "https://docs.rs/zarrs_http" +repository = "https://github.com/LDeakin/zarrs/tree/main/zarrs_http" license = "MIT OR Apache-2.0" -keywords = ["zarr", "zarrs", "storage", "store"] +keywords = ["zarr", "zarrs", "http"] categories = ["encoding"] [lints] diff --git a/zarrs_metadata/CHANGELOG.md b/zarrs_metadata/CHANGELOG.md index a75d96ac..26c9a3c6 100644 --- a/zarrs_metadata/CHANGELOG.md +++ b/zarrs_metadata/CHANGELOG.md @@ -7,6 +7,36 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.4] - 2025-02-13 + +### Added +- Add `UnsupportedAdditionalFieldError::new` + +### Fixed +- Make `AdditionalField` public and permit any JSON type (not just objects) + +## [0.3.3] - 2025-02-06 + +### Fixed +- Permit string compression levels in `zstd` codec metadata (for `zarr-python` compatibility) +- Use `bytes` codec with native endianness if unset for a Zarr V2 array + +## [0.3.2] - 2025-02-04 + +### Added +- Derive `Copy` for `ArrayMetadataV2Order` +- Add `codec_metadata_v2_to_v3` + +### Fixed +- Interpret a `0` fill value as `""` for Zarr V2 string arrays (for `zarr-python` compatibility) ([#140] by [@zqfang]) + +[#140]: https://github.com/LDeakin/zarrs/pull/140 + +## [0.3.1] - 2025-01-29 + +### Fixed +- Interpret a `null` fill value as `""` for Zarr V2 
string arrays (for `zarr-python` compatibility) + ## [0.3.0] - 2025-01-10 ### Added @@ -56,7 +86,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Initial release - Split from the `metadata` module of `zarrs` 0.17.0-dev -[unreleased]: https://github.com/LDeakin/zarrs/compare/zarrs_metadata-v0.3.0...HEAD +[unreleased]: https://github.com/LDeakin/zarrs/compare/zarrs_metadata-v0.3.4...HEAD +[0.3.4]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_metadata-v0.3.4 +[0.3.3]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_metadata-v0.3.3 +[0.3.2]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_metadata-v0.3.2 +[0.3.1]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_metadata-v0.3.1 [0.3.0]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_metadata-v0.3.0 [0.2.0]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_metadata-v0.2.0 [0.1.0]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_metadata-v0.1.0 + +[@zqfang]: https://github.com/zqfang diff --git a/zarrs_metadata/Cargo.toml b/zarrs_metadata/Cargo.toml index 940579cf..2efac0da 100644 --- a/zarrs_metadata/Cargo.toml +++ b/zarrs_metadata/Cargo.toml @@ -1,13 +1,13 @@ [package] name = "zarrs_metadata" -version = "0.3.0" +version = "0.3.4" authors = ["Lachlan Deakin "] edition = "2021" rust-version = "1.77" description = "Zarr metadata support for the zarrs crate" homepage = "https://zarrs.dev" -documentation = "https://docs.rs/zarrs_object_store" -repository = "https://github.com/LDeakin/zarrs" +documentation = "https://docs.rs/zarrs_metadata" +repository = "https://github.com/LDeakin/zarrs/tree/main/zarrs_metadata" license = "MIT OR Apache-2.0" keywords = ["zarr", "zarrs", "metadata"] categories = ["encoding"] @@ -17,9 +17,9 @@ workspace = true [dependencies] derive_more = { version = "1.0.0", features = ["display", "from"] } -half = { version = "2.0.0", features = ["bytemuck"] } +half = { workspace = true } monostate = "0.1.0" -num = { version = "0.4.1" } +num = { workspace = true } serde = { version = "1.0.185", features = ["derive"] } serde_json = { version = "1.0.71", features = ["float_roundtrip", "preserve_order"] } serde_repr = "0.1.19" diff --git a/zarrs_metadata/src/lib.rs b/zarrs_metadata/src/lib.rs index fadc3b2e..16577efd 100644 --- a/zarrs_metadata/src/lib.rs +++ b/zarrs_metadata/src/lib.rs @@ -75,7 +75,7 @@ pub enum NodeMetadata { #[cfg(test)] mod tests { use super::*; - use v3::{AdditionalFields, MetadataV3}; + use v3::{AdditionalField, AdditionalFields, MetadataV3}; #[test] fn metadata() { @@ -111,14 +111,27 @@ mod tests { } #[test] - fn additional_fields_auto() { - let mut additional_fields = AdditionalFields::new(); + fn additional_fields_constructors() { let additional_field = serde_json::Map::new(); - additional_fields.insert("key".to_string(), additional_field.into()); - assert!(!additional_fields.contains_key("must_understand")); - assert!(serde_json::to_string(&additional_fields) - .unwrap() - .contains(r#""must_understand":false"#)); + let additional_field: AdditionalField = additional_field.into(); + assert!(additional_field.must_understand()); + assert!( + additional_field.as_value() == &serde_json::Value::Object(serde_json::Map::default()) + ); + assert!(serde_json::to_string(&additional_field).unwrap() == r#"{"must_understand":true}"#); + + let additional_field: AdditionalField = AdditionalField::new("test", true); + assert!(additional_field.must_understand()); + assert!(additional_field.as_value() == &serde_json::Value::String("test".to_string())); + 
assert!(serde_json::to_string(&additional_field).unwrap() == r#""test""#); + + let additional_field: AdditionalField = AdditionalField::new(123, false); + assert!(!additional_field.must_understand()); + assert!( + additional_field.as_value() + == &serde_json::Value::Number(serde_json::Number::from(123)) + ); + assert!(serde_json::to_string(&additional_field).unwrap() == "123"); } #[test] @@ -127,20 +140,23 @@ mod tests { "unknown_field": { "key": "value", "must_understand": false - } - }"#; - let additional_fields = serde_json::from_str::(json); - assert!(additional_fields.is_ok()); - } - - #[test] - fn additional_fields_invalid() { - let json = r#"{ - "unknown_field": { + }, + "unsupported_field_1": { + "key": "value", + "must_understand": true + }, + "unsupported_field_2": { "key": "value" - } + }, + "unsupported_field_3": [], + "unsupported_field_4": "test" }"#; - let additional_fields = serde_json::from_str::(json); - assert!(additional_fields.is_err()); + let additional_fields = serde_json::from_str::(json).unwrap(); + assert!(additional_fields.len() == 5); + assert!(!additional_fields["unknown_field"].must_understand()); + assert!(additional_fields["unsupported_field_1"].must_understand()); + assert!(additional_fields["unsupported_field_2"].must_understand()); + assert!(additional_fields["unsupported_field_3"].must_understand()); + assert!(additional_fields["unsupported_field_4"].must_understand()); } } diff --git a/zarrs_metadata/src/v2/array.rs b/zarrs_metadata/src/v2/array.rs index aedebbfa..c679e5fb 100644 --- a/zarrs_metadata/src/v2/array.rs +++ b/zarrs_metadata/src/v2/array.rs @@ -296,7 +296,7 @@ impl Serialize for FillValueMetadataV2 { } /// The layout of bytes within each chunk of the array. -#[derive(Serialize, Deserialize, Clone, PartialEq, Eq, Debug)] +#[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq, Debug)] pub enum ArrayMetadataV2Order { /// Row-major order. The last dimension varies fastest. C, diff --git a/zarrs_metadata/src/v2_to_v3.rs b/zarrs_metadata/src/v2_to_v3.rs index a4de5c8a..3a203a10 100644 --- a/zarrs_metadata/src/v2_to_v3.rs +++ b/zarrs_metadata/src/v2_to_v3.rs @@ -11,7 +11,7 @@ use crate::{ data_type_metadata_v2_to_endianness, ArrayMetadataV2Order, DataTypeMetadataV2, DataTypeMetadataV2InvalidEndiannessError, FillValueMetadataV2, }, - ArrayMetadataV2, GroupMetadataV2, + ArrayMetadataV2, GroupMetadataV2, MetadataV2, }, v3::{ array::{ @@ -25,6 +25,7 @@ use crate::{ }, ArrayMetadataV3, GroupMetadataV3, MetadataV3, }, + Endianness, }; use super::v3::array::data_type::DataTypeMetadataV3; @@ -61,73 +62,28 @@ pub enum ArrayMetadataV2ToV3ConversionError { Other(String), } -/// Convert Zarr V2 array metadata to V3. +/// Convert Zarr V2 codec metadata to the equivalent Zarr V3 codec metadata. /// /// # Errors /// Returns a [`ArrayMetadataV2ToV3ConversionError`] if the metadata is invalid or is not compatible with Zarr V3 metadata. 
#[allow(clippy::too_many_lines)]
-pub fn array_metadata_v2_to_v3(
-    array_metadata_v2: &ArrayMetadataV2,
-) -> Result<ArrayMetadataV3, ArrayMetadataV2ToV3ConversionError> {
-    let shape = array_metadata_v2.shape.clone();
-    let chunk_grid = MetadataV3::new_with_serializable_configuration(
-        crate::v3::array::chunk_grid::regular::IDENTIFIER,
-        &RegularChunkGridConfiguration {
-            chunk_shape: array_metadata_v2.chunks.clone(),
-        },
-    )?;
-
-    let (Ok(data_type), endianness) = (
-        data_type_metadata_v2_to_v3_data_type(&array_metadata_v2.dtype),
-        data_type_metadata_v2_to_endianness(&array_metadata_v2.dtype)
-            .map_err(ArrayMetadataV2ToV3ConversionError::InvalidEndianness)?,
-    ) else {
-        return Err(ArrayMetadataV2ToV3ConversionError::UnsupportedDataType(
-            match &array_metadata_v2.dtype {
-                DataTypeMetadataV2::Simple(dtype) => dtype.clone(),
-                DataTypeMetadataV2::Structured(dtype) => {
-                    return Err(ArrayMetadataV2ToV3ConversionError::UnsupportedDataType(
-                        format!("{dtype:?}"),
-                    ))
-                }
-            },
-        ));
-    };
-
-    // Fill value
-    let mut fill_value = array_metadata_fill_value_v2_to_v3(&array_metadata_v2.fill_value)
-        .ok_or_else(|| {
-            // TODO: How best to deal with null fill values? What do other implementations do?
-            ArrayMetadataV2ToV3ConversionError::UnsupportedFillValue(
-                data_type.to_string(),
-                array_metadata_v2.fill_value.clone(),
-            )
-        })?;
-    if data_type.name() == "bool" {
-        // Map a 0/1 scalar fill value to a bool
-        if let Some(fill_value_uint) = fill_value.try_as_uint::<u64>() {
-            if fill_value_uint == 0 {
-                fill_value = FillValueMetadataV3::Bool(false);
-            } else if fill_value_uint == 1 {
-                fill_value = FillValueMetadataV3::Bool(true);
-            } else {
-                return Err(ArrayMetadataV2ToV3ConversionError::UnsupportedFillValue(
-                    data_type.to_string(),
-                    array_metadata_v2.fill_value.clone(),
-                ));
-            }
-        }
-    }
-
+pub fn codec_metadata_v2_to_v3(
+    order: ArrayMetadataV2Order,
+    dimensionality: usize,
+    data_type: &DataTypeMetadataV3,
+    endianness: Option<Endianness>,
+    filters: &Option<Vec<MetadataV2>>,
+    compressor: &Option<MetadataV2>,
+) -> Result<Vec<MetadataV3>, ArrayMetadataV2ToV3ConversionError> {
     let mut codecs: Vec<MetadataV3> = vec![];
 
     // Array-to-array codecs
-    if array_metadata_v2.order == ArrayMetadataV2Order::F {
+    if order == ArrayMetadataV2Order::F {
         let transpose_metadata = MetadataV3::new_with_serializable_configuration(
             crate::v3::array::codec::transpose::IDENTIFIER,
             &TransposeCodecConfigurationV1 {
                 order: {
-                    let f_order: Vec<usize> = (0..array_metadata_v2.shape.len()).rev().collect();
+                    let f_order: Vec<usize> = (0..dimensionality).rev().collect();
                     unsafe {
                         // SAFETY: f_order is valid
                         TransposeOrder::new(&f_order).unwrap_unchecked()
@@ -140,7 +96,7 @@
     // Filters (array to array or array to bytes codecs)
     let mut has_array_to_bytes = false;
-    if let Some(filters) = &array_metadata_v2.filters {
+    if let Some(filters) = filters {
         for filter in filters {
             // TODO: Add a V2 registry with V2 to V3 conversion functions
             match filter.id() {
@@ -163,7 +119,7 @@
     }
 
     // Compressor (array to bytes codec)
-    if let Some(compressor) = &array_metadata_v2.compressor {
+    if let Some(compressor) = compressor {
         #[allow(clippy::single_match)]
         match compressor.id() {
             crate::v2::array::codec::zfpy::IDENTIFIER => {
@@ -193,13 +149,15 @@
     if !has_array_to_bytes {
         let bytes_metadata = MetadataV3::new_with_serializable_configuration(
             crate::v3::array::codec::bytes::IDENTIFIER,
-            &BytesCodecConfigurationV1 { endian: endianness },
+            &BytesCodecConfigurationV1 {
+                endian: Some(endianness.unwrap_or(Endianness::native())),
+            },
         )?;
         codecs.push(bytes_metadata);
     }
 
     //
Compressor (bytes to bytes codec) - if let Some(compressor) = &array_metadata_v2.compressor { + if let Some(compressor) = compressor { match compressor.id() { crate::v2::array::codec::zfpy::IDENTIFIER | crate::v3::array::codec::pcodec::IDENTIFIER => { @@ -209,7 +167,7 @@ pub fn array_metadata_v2_to_v3( let blosc = serde_json::from_value::( serde_json::to_value(compressor.configuration())?, )?; - let configuration = codec_blosc_v2_numcodecs_to_v3(&blosc, &data_type); + let configuration = codec_blosc_v2_numcodecs_to_v3(&blosc, data_type); codecs.push(MetadataV3::new_with_serializable_configuration( crate::v3::array::codec::blosc::IDENTIFIER, &configuration, @@ -232,6 +190,88 @@ pub fn array_metadata_v2_to_v3( }; } + Ok(codecs) +} + +/// Convert Zarr V2 array metadata to V3. +/// +/// # Errors +/// Returns a [`ArrayMetadataV2ToV3ConversionError`] if the metadata is invalid or is not compatible with Zarr V3 metadata. +#[allow(clippy::too_many_lines)] +pub fn array_metadata_v2_to_v3( + array_metadata_v2: &ArrayMetadataV2, +) -> Result { + let shape = array_metadata_v2.shape.clone(); + let chunk_grid = MetadataV3::new_with_serializable_configuration( + crate::v3::array::chunk_grid::regular::IDENTIFIER, + &RegularChunkGridConfiguration { + chunk_shape: array_metadata_v2.chunks.clone(), + }, + )?; + + let (Ok(data_type), endianness) = ( + data_type_metadata_v2_to_v3_data_type(&array_metadata_v2.dtype), + data_type_metadata_v2_to_endianness(&array_metadata_v2.dtype) + .map_err(ArrayMetadataV2ToV3ConversionError::InvalidEndianness)?, + ) else { + return Err(ArrayMetadataV2ToV3ConversionError::UnsupportedDataType( + match &array_metadata_v2.dtype { + DataTypeMetadataV2::Simple(dtype) => dtype.clone(), + DataTypeMetadataV2::Structured(dtype) => { + return Err(ArrayMetadataV2ToV3ConversionError::UnsupportedDataType( + format!("{dtype:?}"), + )) + } + }, + )); + }; + + // Fill value + let mut fill_value = array_metadata_fill_value_v2_to_v3(&array_metadata_v2.fill_value) + .or_else(|| { + // Support zarr-python encoded string arrays with a `null` fill value + match data_type.name().as_str() { + "string" => Some(FillValueMetadataV3::String(String::new())), + _ => None, + } + }) + .ok_or_else(|| { + // TODO: How best to deal with null fill values? What do other implementations do? 
+ ArrayMetadataV2ToV3ConversionError::UnsupportedFillValue( + data_type.to_string(), + array_metadata_v2.fill_value.clone(), + ) + })?; + if data_type.name() == "bool" { + // Map a 0/1 scalar fill value to a bool + if let Some(fill_value_uint) = fill_value.try_as_uint::() { + if fill_value_uint == 0 { + fill_value = FillValueMetadataV3::Bool(false); + } else if fill_value_uint == 1 { + fill_value = FillValueMetadataV3::Bool(true); + } else { + return Err(ArrayMetadataV2ToV3ConversionError::UnsupportedFillValue( + data_type.to_string(), + array_metadata_v2.fill_value.clone(), + )); + } + } + } else if data_type.name() == "string" { + // Add a special case for `zarr-python` string data with a 0 fill value -> empty string + if let Some(0) = fill_value.try_as_uint::() { + fill_value = FillValueMetadataV3::String(String::new()); + } + } + + let codecs = codec_metadata_v2_to_v3( + array_metadata_v2.order, + array_metadata_v2.shape.len(), + &data_type, + endianness, + &array_metadata_v2.filters, + &array_metadata_v2.compressor, + )?; + let chunk_key_encoding = MetadataV3::new_with_serializable_configuration( crate::v3::array::chunk_key_encoding::v2::IDENTIFIER, &V2ChunkKeyEncodingConfiguration { diff --git a/zarrs_metadata/src/v3.rs b/zarrs_metadata/src/v3.rs index d67b0b01..960e3c58 100644 --- a/zarrs_metadata/src/v3.rs +++ b/zarrs_metadata/src/v3.rs @@ -9,8 +9,8 @@ pub use group::GroupMetadataV3; mod metadata; pub use metadata::{ - AdditionalFields, ConfigurationInvalidError, MetadataConfiguration, MetadataV3, - UnsupportedAdditionalFieldError, + AdditionalField, AdditionalFields, ConfigurationInvalidError, MetadataConfiguration, + MetadataV3, UnsupportedAdditionalFieldError, }; /// V3 node metadata ([`ArrayMetadataV3`] or [`GroupMetadataV3`]). diff --git a/zarrs_metadata/src/v3/array/codec/zstd.rs b/zarrs_metadata/src/v3/array/codec/zstd.rs index 2989d432..515a46e4 100644 --- a/zarrs_metadata/src/v3/array/codec/zstd.rs +++ b/zarrs_metadata/src/v3/array/codec/zstd.rs @@ -1,5 +1,6 @@ use derive_more::{Display, From}; use serde::{Deserialize, Serialize}; +use serde_json::Value; /// The identifier for the `zstd` codec. 
 pub const IDENTIFIER: &str = "zstd";
@@ -40,12 +41,27 @@ pub struct ZstdCompressionLevel(i32);
 
 impl<'de> serde::Deserialize<'de> for ZstdCompressionLevel {
     fn deserialize<D: serde::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
-        let number = serde_json::Number::deserialize(d)?;
-        if let Some(number) = number.as_i64() {
-            if (-131_072..=22).contains(&number) {
-                #[allow(clippy::cast_possible_truncation)]
-                return Ok(Self(number as i32));
+        let value = Value::deserialize(d)?;
+        match value {
+            Value::Number(number) => {
+                if let Some(number) = number.as_i64() {
+                    if (-131_072..=22).contains(&number) {
+                        #[allow(clippy::cast_possible_truncation)]
+                        return Ok(Self(number as i32));
+                    }
+                }
             }
+            Value::String(string) => {
+                // COMPATIBILITY: support data created with zarr-python that uses a string for the level
+                // https://github.com/zarr-developers/zarr-python/blob/a52048ddb2d5d069c3404e7457439a9ecb5e40c3/tests/test_v2.py#L278-L280
+                if let Ok(number) = string.parse::<i64>() {
+                    if (-131_072..=22).contains(&number) {
+                        #[allow(clippy::cast_possible_truncation)]
+                        return Ok(Self(number as i32));
+                    }
+                }
+            }
+            _ => {}
         }
         Err(serde::de::Error::custom(
             "Zstd compression level must be an integer between -131072 and 22",
diff --git a/zarrs_metadata/src/v3/array/fill_value.rs b/zarrs_metadata/src/v3/array/fill_value.rs
index 67e140cc..f0a3e067 100644
--- a/zarrs_metadata/src/v3/array/fill_value.rs
+++ b/zarrs_metadata/src/v3/array/fill_value.rs
@@ -6,6 +6,8 @@
 //!
 //! The interpretation of fill values is data type dependent.
 
+use std::mem::size_of; // TODO: Can be removed with Rust 1.80+
+
 use derive_more::{Display, From};
 use half::{bf16, f16};
 use num::traits::float::FloatCore;
@@ -63,11 +65,11 @@ impl FillValueFloat {
             Self::Float(float) => T::from(*float),
             Self::HexString(hex_string) => {
                 let bytes: &[u8] = hex_string.as_be_bytes();
-                if bytes.len() == core::mem::size_of::<T>() {
+                if bytes.len() == size_of::<T>() {
                     // NOTE: Cleaner way of doing this?
-                    if core::mem::size_of::<T>() == core::mem::size_of::<f32>() {
+                    if size_of::<T>() == size_of::<f32>() {
                         T::from(f32::from_be_bytes(bytes.try_into().unwrap_or_default()))
-                    } else if core::mem::size_of::<T>() == core::mem::size_of::<f64>() {
+                    } else if size_of::<T>() == size_of::<f64>() {
                         T::from(f64::from_be_bytes(bytes.try_into().unwrap_or_default()))
                     } else {
                         None
@@ -228,11 +230,11 @@ impl FillValueMetadataV3 {
             F::Float(float) => T::from(*float),
             F::HexString(hex_string) => {
                 let bytes = hex_string.as_be_bytes();
-                if bytes.len() == core::mem::size_of::<T>() {
+                if bytes.len() == size_of::<T>() {
                     // NOTE: Cleaner way of doing this?
-                    if core::mem::size_of::<T>() == core::mem::size_of::<f32>() {
+                    if size_of::<T>() == size_of::<f32>() {
                         T::from(f32::from_be_bytes(bytes.try_into().unwrap_or_default()))
-                    } else if core::mem::size_of::<T>() == core::mem::size_of::<f64>() {
+                    } else if size_of::<T>() == size_of::<f64>() {
                         T::from(f64::from_be_bytes(bytes.try_into().unwrap_or_default()))
                     } else {
                         None
@@ -247,9 +249,9 @@
                 NF::PosInfinity => Some(T::infinity()),
                 NF::NegInfinity => Some(T::neg_infinity()),
                 NF::NaN => {
-                    if core::mem::size_of::<T>() == core::mem::size_of::<f32>() {
+                    if size_of::<T>() == size_of::<f32>() {
                         T::from(ZARR_NAN_F32)
-                    } else if core::mem::size_of::<T>() == core::mem::size_of::<f64>() {
+                    } else if size_of::<T>() == size_of::<f64>() {
                         T::from(ZARR_NAN_F64)
                     } else {
                         None
diff --git a/zarrs_metadata/src/v3/metadata.rs b/zarrs_metadata/src/v3/metadata.rs
index edcafa6d..bcc4279f 100644
--- a/zarrs_metadata/src/v3/metadata.rs
+++ b/zarrs_metadata/src/v3/metadata.rs
@@ -1,5 +1,6 @@
 use derive_more::From;
 use serde::{de::DeserializeOwned, ser::SerializeMap, Deserialize, Serialize};
+use serde_json::Value;
 use thiserror::Error;
 
 /// Metadata with a name and optional configuration.
@@ -33,7 +34,7 @@ pub struct MetadataV3 {
 }
 
 /// Configuration metadata.
-pub type MetadataConfiguration = serde_json::Map<String, serde_json::Value>;
+pub type MetadataConfiguration = serde_json::Map<String, Value>;
 
 impl TryFrom<&str> for MetadataV3 {
     type Error = serde_json::Error;
@@ -138,7 +139,7 @@ impl MetadataV3 {
         configuration: &TConfiguration,
     ) -> Result<Self, serde_json::Error> {
         let configuration = serde_json::to_value(configuration)?;
-        if let serde_json::Value::Object(configuration) = configuration {
+        if let Value::Object(configuration) = configuration {
             Ok(Self::new_with_configuration(name, configuration))
         } else {
             Err(serde::ser::Error::custom(
@@ -212,6 +213,7 @@ impl ConfigurationInvalidError {
     }
 }
 
+// FIXME: Move to `zarrs` itself in 0.4.0
 /// An unsupported additional field error.
 ///
 /// An unsupported field in array or group metadata is an unrecognised field without `"must_understand": false`.
@@ -219,10 +221,16 @@
 #[error("unsupported additional field {name} with value {value}")]
 pub struct UnsupportedAdditionalFieldError {
     name: String,
-    value: serde_json::Value,
+    value: Value,
 }
 
 impl UnsupportedAdditionalFieldError {
+    /// Create a new [`UnsupportedAdditionalFieldError`].
+    #[must_use]
+    pub fn new(name: String, value: Value) -> UnsupportedAdditionalFieldError {
+        Self { name, value }
+    }
+
     /// Return the name of the unsupported additional field.
     #[must_use]
     pub fn name(&self) -> &str {
@@ -231,54 +239,110 @@ impl UnsupportedAdditionalFieldError {
 
     /// Return the value of the unsupported additional field.
     #[must_use]
-    pub const fn value(&self) -> &serde_json::Value {
+    pub const fn value(&self) -> &Value {
         &self.value
     }
 }
 
 /// An additional field in array or group metadata.
 ///
-/// Must be an object with a `"must_understand": false` field.
-#[derive(Serialize, Deserialize, Clone, Eq, PartialEq, Debug, Default, From)]
+/// A field that is not recognised / supported by `zarrs` will be considered an additional field.
+/// Additional fields can be any JSON type.
+/// An array / group cannot be created with an additional field, unless the additional field is an object with a `"must_understand": false` field.
+///
+/// ### Example additional field JSON
+/// ```json
+/// "unknown_field": {
+///   "key": "value",
+///   "must_understand": false
+/// },
+/// "unsupported_field_1": {
+///   "key": "value",
+///   "must_understand": true
+/// },
+/// "unsupported_field_2": {
+///   "key": "value"
+/// },
+/// "unsupported_field_3": [],
+/// "unsupported_field_4": "test"
+/// ```
+#[derive(Clone, Eq, PartialEq, Debug, Default)]
 pub struct AdditionalField {
-    must_understand: monostate::MustBe!(false),
-    #[serde(flatten)]
-    fields: serde_json::Map<String, Value>,
+    field: Value,
+    must_understand: bool,
 }
 
 impl AdditionalField {
-    /// Return the underlying map.
+    /// Create a new additional field.
+    #[must_use]
+    pub fn new(field: impl Into<Value>, must_understand: bool) -> AdditionalField {
+        Self {
+            field: field.into(),
+            must_understand,
+        }
+    }
+
+    /// Return the underlying value.
+    #[must_use]
+    pub const fn as_value(&self) -> &Value {
+        &self.field
+    }
+
+    /// Return the `must_understand` component of the additional field.
     #[must_use]
-    pub const fn as_map(&self) -> &serde_json::Map<String, Value> {
-        &self.fields
+    pub const fn must_understand(&self) -> bool {
+        self.must_understand
     }
 }
 
-impl From<AdditionalField> for serde_json::Map<String, Value> {
-    fn from(value: AdditionalField) -> Self {
-        value.fields
+impl Serialize for AdditionalField {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        match &self.field {
+            Value::Object(object) => {
+                let mut map = serializer.serialize_map(Some(object.len() + 1))?;
+                map.serialize_entry("must_understand", &Value::Bool(self.must_understand))?;
+                for (k, v) in object {
+                    map.serialize_entry(k, v)?;
+                }
+                map.end()
+            }
+            _ => self.field.serialize(serializer),
+        }
+    }
+}
+
+impl<'de> serde::Deserialize<'de> for AdditionalField {
+    fn deserialize<D: serde::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
+        let value = Value::deserialize(d)?;
+        Ok(value.into())
     }
 }
 
-impl From<serde_json::Map<String, Value>> for AdditionalField {
-    fn from(value: serde_json::Map<String, Value>) -> Self {
+impl<T> From<T> for AdditionalField
+where
+    T: Into<Value>,
+{
+    fn from(field: T) -> Self {
+        let mut value: Value = field.into();
+        let must_understand = if let Some(object) = value.as_object_mut() {
+            if let Some(Value::Bool(must_understand)) = object.remove("must_understand") {
+                must_understand
+            } else {
+                true
+            }
+        } else {
+            true
+        };
         Self {
-            must_understand: monostate::MustBe!(false),
-            fields: value,
+            must_understand,
+            field: value,
         }
     }
 }
 
 /// Additional fields in array or group metadata.
-///
-/// Additional fields are a JSON object with a `"must_understand": false` key-value pair.
-///
-/// ### Example additional field JSON
-/// ```json
-/// "unknown_field": {
-///   "key": "value",
-///   "must_understand": false
-/// }
-/// ```
-// NOTE: It would be nice if this was just a serde_json::Map, but it only has implementations for `<String, Value>`.
+// NOTE: It would be nice if this was just a serde_json::Map, but it only has implementations for `<String, Value>`.
 pub type AdditionalFields = std::collections::BTreeMap<String, AdditionalField>;
diff --git a/zarrs_metadata/tests/extensions_zep_9.rs b/zarrs_metadata/tests/extensions_zep_9.rs
new file mode 100644
index 00000000..4c623844
--- /dev/null
+++ b/zarrs_metadata/tests/extensions_zep_9.rs
@@ -0,0 +1,56 @@
+#![allow(missing_docs)]
+
+use zarrs_metadata::v3::ArrayMetadataV3;
+
+#[test]
+fn array_extensions() {
+    let json = r#"{
+    "zarr_format": 3,
+    "node_type": "array",
+    "data_type": "https://example.com/zarr/string",
+    "fill_value": "",
+    "chunk_key_encoding": {
+        "name": "default",
+        "configuration": { "separator": "."
} + }, + "codecs": [ + { + "name": "https://numcodecs.dev/vlen-utf8" + }, + { + "name": "zstd", + "configuration": {} + } + ], + "chunk_grid": { + "name": "regular", + "configuration": { "chunk_shape": [ 32 ] } + }, + "shape": [ 128 ], + "dimension_names": [ "x" ], + "attributes": {}, + "storage_transformers": [], + "extensions": [ + { + "name": "https://example.com/zarr/offset", + "configuration": { "offset": [ 12 ] } + }, + { + "name": "https://example.com/zarr/array-statistics", + "configuration": { + "min": 5, + "max": 12 + }, + "must_understand": false + }, + { + "name": "https://example.com/zarr/consolidated-metadata", + "configuration": {}, + "must_understand": false + } + ] +}"#; + + let metadata: ArrayMetadataV3 = serde_json::from_str(&json).unwrap(); + assert_eq!(metadata.data_type.name(), "https://example.com/zarr/string"); +} diff --git a/zarrs_object_store/Cargo.toml b/zarrs_object_store/Cargo.toml index 7089eb66..977c62f7 100644 --- a/zarrs_object_store/Cargo.toml +++ b/zarrs_object_store/Cargo.toml @@ -7,7 +7,7 @@ rust-version = "1.77" description = "object_store store support for the zarrs crate" homepage = "https://zarrs.dev" documentation = "https://docs.rs/zarrs_object_store" -repository = "https://github.com/LDeakin/zarrs" +repository = "https://github.com/LDeakin/zarrs/tree/main/zarrs_object_store" license = "MIT OR Apache-2.0" keywords = ["zarr", "zarrs", "storage", "store"] categories = ["encoding"] diff --git a/zarrs_opendal/Cargo.toml b/zarrs_opendal/Cargo.toml index a7a39c85..cd93d7c0 100644 --- a/zarrs_opendal/Cargo.toml +++ b/zarrs_opendal/Cargo.toml @@ -7,7 +7,7 @@ rust-version = "1.77" description = "opendal store support for the zarrs crate" homepage = "https://zarrs.dev" documentation = "https://docs.rs/zarrs_opendal" -repository = "https://github.com/LDeakin/zarrs" +repository = "https://github.com/LDeakin/zarrs/tree/main/zarrs_opendal" license = "MIT OR Apache-2.0" keywords = ["zarr", "zarrs", "storage", "store"] categories = ["encoding"] diff --git a/zarrs_storage/Cargo.toml b/zarrs_storage/Cargo.toml index f7a9b873..7c8d619a 100644 --- a/zarrs_storage/Cargo.toml +++ b/zarrs_storage/Cargo.toml @@ -7,7 +7,7 @@ rust-version = "1.77" description = "The storage API and default stores for the zarrs crate" homepage = "https://zarrs.dev" documentation = "https://docs.rs/zarrs_storage" -repository = "https://github.com/LDeakin/zarrs" +repository = "https://github.com/LDeakin/zarrs/tree/main/zarrs_storage" license = "MIT OR Apache-2.0" keywords = ["zarr", "zarrs", "storage", "store"] categories = ["encoding"] @@ -28,7 +28,7 @@ bytes = "1.6.0" derive_more = { version = "1.0.0", features = ["deref", "display", "from"] } futures = { version = "0.3.29", optional = true } itertools = "0.14.0" -parking_lot = "0.12.0" +parking_lot = "0.12.0" # TODO: Remove with Rust 1.78+ thiserror = "2.0.0" unsafe_cell_slice = "0.2.0" diff --git a/zarrs_storage/src/store/memory_store.rs b/zarrs_storage/src/store/memory_store.rs index f92bbd6f..93139595 100644 --- a/zarrs_storage/src/store/memory_store.rs +++ b/zarrs_storage/src/store/memory_store.rs @@ -1,6 +1,6 @@ //! A synchronous in-memory store. 
-use parking_lot::RwLock; +use parking_lot::RwLock; // TODO: std::sync::RwLock with Rust 1.78+ use std::sync::Mutex; use crate::{ diff --git a/zarrs_zip/Cargo.toml b/zarrs_zip/Cargo.toml index 14fefe91..1ca195fa 100644 --- a/zarrs_zip/Cargo.toml +++ b/zarrs_zip/Cargo.toml @@ -7,7 +7,7 @@ rust-version = "1.77" description = "A storage adapter for zip files for the zarrs crate" homepage = "https://zarrs.dev" documentation = "https://docs.rs/zarrs_zip" -repository = "https://github.com/LDeakin/zarrs" +repository = "https://github.com/LDeakin/zarrs/tree/main/zarrs_zip" license = "MIT OR Apache-2.0" keywords = ["zarr", "zarrs", "storage", "store", "zip"] categories = ["encoding"]
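(Closing aside, not part of the patch: the recurring `parking_lot` TODOs above anticipate swapping to `std::sync::RwLock` once the MSRV reaches 1.78. The practical difference is that std's guards sit behind a poisoning `Result`; an illustrative sketch:)

```rust
use std::sync::RwLock;

fn main() {
    let cache: RwLock<Vec<u8>> = RwLock::new(Vec::new());
    // parking_lot: cache.write().extend_from_slice(b"chunk");
    // std: the lock can be poisoned, so each guard is behind a Result.
    cache.write().unwrap().extend_from_slice(b"chunk");
    assert_eq!(cache.read().unwrap().as_slice(), b"chunk");
}
```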