From 8289587aba5a442dc357b488af4e6fe4d77978f9 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Mon, 13 Jan 2025 07:33:56 +1100 Subject: [PATCH 01/45] chore: add todo for removing parking lot dependency --- zarrs_filesystem/Cargo.toml | 2 +- zarrs_filesystem/src/lib.rs | 2 +- zarrs_storage/Cargo.toml | 2 +- zarrs_storage/src/store/memory_store.rs | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/zarrs_filesystem/Cargo.toml b/zarrs_filesystem/Cargo.toml index a26b2cc1..fabd2bba 100644 --- a/zarrs_filesystem/Cargo.toml +++ b/zarrs_filesystem/Cargo.toml @@ -21,7 +21,7 @@ derive_more = { version = "1.0.0", features = ["from"] } itertools = "0.14.0" libc = "0.2.158" page_size = "0.6.0" -parking_lot = "0.12.0" +parking_lot = "0.12.0" # TODO: Remove with Rust 1.78+ pathdiff = "0.2.0" thiserror = "2.0.0" walkdir = "2.3.2" diff --git a/zarrs_filesystem/src/lib.rs b/zarrs_filesystem/src/lib.rs index 103756c6..18ecd261 100644 --- a/zarrs_filesystem/src/lib.rs +++ b/zarrs_filesystem/src/lib.rs @@ -15,7 +15,7 @@ use zarrs_storage::{ }; use bytes::BytesMut; -use parking_lot::RwLock; +use parking_lot::RwLock; // TODO: std::sync::RwLock with Rust 1.78+ use thiserror::Error; use walkdir::WalkDir; diff --git a/zarrs_storage/Cargo.toml b/zarrs_storage/Cargo.toml index f7a9b873..a14d9931 100644 --- a/zarrs_storage/Cargo.toml +++ b/zarrs_storage/Cargo.toml @@ -28,7 +28,7 @@ bytes = "1.6.0" derive_more = { version = "1.0.0", features = ["deref", "display", "from"] } futures = { version = "0.3.29", optional = true } itertools = "0.14.0" -parking_lot = "0.12.0" +parking_lot = "0.12.0" # TODO: Remove with Rust 1.78+ thiserror = "2.0.0" unsafe_cell_slice = "0.2.0" diff --git a/zarrs_storage/src/store/memory_store.rs b/zarrs_storage/src/store/memory_store.rs index f92bbd6f..93139595 100644 --- a/zarrs_storage/src/store/memory_store.rs +++ b/zarrs_storage/src/store/memory_store.rs @@ -1,6 +1,6 @@ //! A synchronous in-memory store. 
-use parking_lot::RwLock; +use parking_lot::RwLock; // TODO: std::sync::RwLock with Rust 1.78+ use std::sync::Mutex; use crate::{ From 4a240907dbd7894d5dc1a271c865a08619729248 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Mon, 13 Jan 2025 07:42:26 +1100 Subject: [PATCH 02/45] chore: use `size_of` from prelude Rust 1.80+ --- zarrs/src/array.rs | 6 +++--- zarrs/src/array/array_bytes.rs | 1 - zarrs/src/array/chunk_cache/chunk_cache_lru.rs | 2 +- zarrs/src/array/codec/array_to_bytes/bytes.rs | 4 ++-- .../array/codec/array_to_bytes/codec_chain.rs | 2 +- zarrs/src/array/codec/array_to_bytes/pcodec.rs | 4 ++-- .../src/array/codec/array_to_bytes/sharding.rs | 10 +++++----- zarrs/src/array/codec/array_to_bytes/vlen.rs | 2 +- .../codec/array_to_bytes/vlen/vlen_codec.rs | 2 +- .../src/array/codec/array_to_bytes/vlen_v2.rs | 2 +- .../array_to_bytes/vlen_v2/vlen_v2_codec.rs | 2 +- zarrs/src/array/codec/array_to_bytes/zfp.rs | 4 ++-- zarrs/src/array/codec/bytes_to_bytes/blosc.rs | 4 ++-- zarrs/src/array/codec/bytes_to_bytes/bz2.rs | 4 ++-- zarrs/src/array/codec/bytes_to_bytes/crc32c.rs | 5 ++--- .../array/codec/bytes_to_bytes/fletcher32.rs | 5 ++--- .../src/array/codec/bytes_to_bytes/gdeflate.rs | 5 ++--- .../bytes_to_bytes/gdeflate/gdeflate_codec.rs | 1 - zarrs/src/array/codec/bytes_to_bytes/gzip.rs | 4 ++-- .../codec/bytes_to_bytes/test_unbounded.rs | 4 ++-- zarrs/src/array/codec/bytes_to_bytes/zstd.rs | 4 ++-- zarrs/src/array/fill_value.rs | 4 ++-- zarrs/src/lib.rs | 2 +- zarrs/tests/array_partial_encode.rs | 1 - zarrs_metadata/src/v3/array/fill_value.rs | 18 ++++++++++-------- 25 files changed, 49 insertions(+), 53 deletions(-) diff --git a/zarrs/src/array.rs b/zarrs/src/array.rs index e76d28db..c85aff98 100644 --- a/zarrs/src/array.rs +++ b/zarrs/src/array.rs @@ -911,8 +911,8 @@ pub fn elements_to_ndarray( let length = elements.len(); ndarray::ArrayD::::from_shape_vec(iter_u64_to_usize(shape.iter()), elements).map_err(|_| { 
ArrayError::CodecError(codec::CodecError::UnexpectedChunkDecodedSize( - length * std::mem::size_of::(), - shape.iter().product::() * std::mem::size_of::() as u64, + length * size_of::(), + shape.iter().product::() * size_of::() as u64, )) }) } @@ -926,7 +926,7 @@ pub fn bytes_to_ndarray( shape: &[u64], bytes: Vec, ) -> Result, ArrayError> { - let expected_len = shape.iter().product::() * core::mem::size_of::() as u64; + let expected_len = shape.iter().product::() * size_of::() as u64; if bytes.len() as u64 != expected_len { return Err(ArrayError::InvalidBytesInputSize(bytes.len(), expected_len)); } diff --git a/zarrs/src/array/array_bytes.rs b/zarrs/src/array/array_bytes.rs index 6c8e171a..51db02c9 100644 --- a/zarrs/src/array/array_bytes.rs +++ b/zarrs/src/array/array_bytes.rs @@ -603,7 +603,6 @@ impl<'a, const N: usize> From<&'a [u8; N]> for ArrayBytes<'a> { #[cfg(test)] mod tests { use std::error::Error; - use std::mem::size_of; use crate::array::Element; diff --git a/zarrs/src/array/chunk_cache/chunk_cache_lru.rs b/zarrs/src/array/chunk_cache/chunk_cache_lru.rs index 73a2f332..fae78303 100644 --- a/zarrs/src/array/chunk_cache/chunk_cache_lru.rs +++ b/zarrs/src/array/chunk_cache/chunk_cache_lru.rs @@ -359,7 +359,7 @@ impl ChunkCache for ChunkCacheDecodedLruSizeLimitThreadLo mod tests { use super::*; - use std::{mem::size_of, sync::Arc}; + use std::sync::Arc; use crate::{ array::{ diff --git a/zarrs/src/array/codec/array_to_bytes/bytes.rs b/zarrs/src/array/codec/array_to_bytes/bytes.rs index 98ba7fc9..a2970240 100644 --- a/zarrs/src/array/codec/array_to_bytes/bytes.rs +++ b/zarrs/src/array/codec/array_to_bytes/bytes.rs @@ -297,7 +297,7 @@ mod tests { .map(|bytes| bytes.into_fixed().unwrap().to_vec()) .flatten() .collect::>() - .chunks(std::mem::size_of::()) + .chunks(size_of::()) .map(|b| u8::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![4, 8]; @@ -343,7 +343,7 @@ mod tests { .map(|bytes| bytes.into_fixed().unwrap().to_vec()) .flatten() 
.collect::>() - .chunks(std::mem::size_of::()) + .chunks(size_of::()) .map(|b| u8::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![4, 8]; diff --git a/zarrs/src/array/codec/array_to_bytes/codec_chain.rs b/zarrs/src/array/codec/array_to_bytes/codec_chain.rs index 2b41fe63..be8fae6a 100644 --- a/zarrs/src/array/codec/array_to_bytes/codec_chain.rs +++ b/zarrs/src/array/codec/array_to_bytes/codec_chain.rs @@ -852,7 +852,7 @@ mod tests { .map(|bytes| bytes.into_fixed().unwrap().to_vec()) .flatten() .collect::>() - .chunks(std::mem::size_of::()) + .chunks(size_of::()) .map(|b| f32::from_ne_bytes(b.try_into().unwrap())) .collect(); println!("decoded_partial_chunk {decoded_partial_chunk:?}"); diff --git a/zarrs/src/array/codec/array_to_bytes/pcodec.rs b/zarrs/src/array/codec/array_to_bytes/pcodec.rs index cf77b2ab..47cc8904 100644 --- a/zarrs/src/array/codec/array_to_bytes/pcodec.rs +++ b/zarrs/src/array/codec/array_to_bytes/pcodec.rs @@ -271,7 +271,7 @@ mod tests { .map(|bytes| bytes.into_fixed().unwrap().into_owned()) .flatten() .collect::>() - .chunks(std::mem::size_of::()) + .chunks(size_of::()) .map(|b| u8::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![4, 8]; @@ -323,7 +323,7 @@ mod tests { .map(|bytes| bytes.into_fixed().unwrap().into_owned()) .flatten() .collect::>() - .chunks(std::mem::size_of::()) + .chunks(size_of::()) .map(|b| u8::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![4, 8]; diff --git a/zarrs/src/array/codec/array_to_bytes/sharding.rs b/zarrs/src/array/codec/array_to_bytes/sharding.rs index b0a7e216..7e2a5955 100644 --- a/zarrs/src/array/codec/array_to_bytes/sharding.rs +++ b/zarrs/src/array/codec/array_to_bytes/sharding.rs @@ -111,7 +111,7 @@ fn decode_shard_index( )?; let decoded_shard_index = decoded_shard_index.into_fixed()?; Ok(decoded_shard_index - .chunks_exact(core::mem::size_of::()) + .chunks_exact(size_of::()) .map(|v| u64::from_ne_bytes(v.try_into().unwrap() /* 
safe */)) .collect()) } @@ -499,7 +499,7 @@ mod tests { .map(|bytes| bytes.into_fixed().unwrap().to_vec()) .flatten() .collect::>() - .chunks(std::mem::size_of::()) + .chunks(size_of::()) .map(|b| u8::from_ne_bytes(b.try_into().unwrap())) .collect(); assert_eq!(answer, decoded_partial_chunk); @@ -584,7 +584,7 @@ mod tests { .map(|bytes| bytes.into_fixed().unwrap().to_vec()) .flatten() .collect::>() - .chunks(std::mem::size_of::()) + .chunks(size_of::()) .map(|b| u8::from_ne_bytes(b.try_into().unwrap())) .collect(); assert_eq!(answer, decoded_partial_chunk); @@ -653,7 +653,7 @@ mod tests { .map(|bytes| bytes.into_fixed().unwrap().to_vec()) .flatten() .collect::>() - .chunks(std::mem::size_of::()) + .chunks(size_of::()) .map(|b| u16::from_ne_bytes(b.try_into().unwrap())) .collect(); @@ -695,7 +695,7 @@ mod tests { .map(|bytes| bytes.into_fixed().unwrap().to_vec()) .flatten() .collect::>() - .chunks(std::mem::size_of::()) + .chunks(size_of::()) .map(|b| u8::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![4, 8]; diff --git a/zarrs/src/array/codec/array_to_bytes/vlen.rs b/zarrs/src/array/codec/array_to_bytes/vlen.rs index 4075aa32..5873fe7a 100644 --- a/zarrs/src/array/codec/array_to_bytes/vlen.rs +++ b/zarrs/src/array/codec/array_to_bytes/vlen.rs @@ -3,7 +3,7 @@ mod vlen_codec; mod vlen_partial_decoder; -use std::{mem::size_of, num::NonZeroU64, sync::Arc}; +use std::{num::NonZeroU64, sync::Arc}; use itertools::Itertools; pub use vlen::IDENTIFIER; diff --git a/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs b/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs index 4c181488..fcf08d4c 100644 --- a/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs +++ b/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs @@ -1,4 +1,4 @@ -use std::{mem::size_of, num::NonZeroU64, sync::Arc}; +use std::{num::NonZeroU64, sync::Arc}; use crate::{ array::{ diff --git a/zarrs/src/array/codec/array_to_bytes/vlen_v2.rs 
b/zarrs/src/array/codec/array_to_bytes/vlen_v2.rs index 5a36eb96..856d7566 100644 --- a/zarrs/src/array/codec/array_to_bytes/vlen_v2.rs +++ b/zarrs/src/array/codec/array_to_bytes/vlen_v2.rs @@ -5,7 +5,7 @@ mod vlen_v2_partial_decoder; pub(crate) mod vlen_v2_macros; -use std::{mem::size_of, sync::Arc}; +use std::sync::Arc; /// The identifier for the `vlen_v2` codec. pub(crate) const IDENTIFIER: &str = "vlen_v2"; diff --git a/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_codec.rs b/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_codec.rs index dd781f67..dec590c9 100644 --- a/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_codec.rs +++ b/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_codec.rs @@ -1,4 +1,4 @@ -use std::{mem::size_of, sync::Arc}; +use std::sync::Arc; use itertools::Itertools; diff --git a/zarrs/src/array/codec/array_to_bytes/zfp.rs b/zarrs/src/array/codec/array_to_bytes/zfp.rs index 8c48c08d..5ccdf3be 100644 --- a/zarrs/src/array/codec/array_to_bytes/zfp.rs +++ b/zarrs/src/array/codec/array_to_bytes/zfp.rs @@ -554,7 +554,7 @@ mod tests { .map(|bytes| bytes.into_fixed().unwrap().to_vec()) .flatten() .collect::>() - .chunks(std::mem::size_of::()) + .chunks(size_of::()) .map(|b| f32::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![ @@ -614,7 +614,7 @@ mod tests { .map(|bytes| bytes.into_fixed().unwrap().to_vec()) .flatten() .collect::>() - .chunks(std::mem::size_of::()) + .chunks(size_of::()) .map(|b| f32::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![ diff --git a/zarrs/src/array/codec/bytes_to_bytes/blosc.rs b/zarrs/src/array/codec/bytes_to_bytes/blosc.rs index 9ad54c9f..d3f20b10 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/blosc.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/blosc.rs @@ -379,7 +379,7 @@ mod tests { let decoded: Vec = decoded .to_vec() - .chunks_exact(std::mem::size_of::()) + .chunks_exact(size_of::()) .map(|b| u16::from_ne_bytes(b.try_into().unwrap())) 
.collect(); @@ -428,7 +428,7 @@ mod tests { let decoded: Vec = decoded .to_vec() - .chunks_exact(std::mem::size_of::()) + .chunks_exact(size_of::()) .map(|b| u16::from_ne_bytes(b.try_into().unwrap())) .collect(); diff --git a/zarrs/src/array/codec/bytes_to_bytes/bz2.rs b/zarrs/src/array/codec/bytes_to_bytes/bz2.rs index 58eff05a..6a182f49 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/bz2.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/bz2.rs @@ -126,7 +126,7 @@ mod tests { let decoded: Vec = decoded .to_vec() - .chunks_exact(std::mem::size_of::()) + .chunks_exact(size_of::()) .map(|b| u16::from_ne_bytes(b.try_into().unwrap())) .collect(); @@ -174,7 +174,7 @@ mod tests { let decoded: Vec = decoded .to_vec() - .chunks_exact(std::mem::size_of::()) + .chunks_exact(size_of::()) .map(|b| u16::from_ne_bytes(b.try_into().unwrap())) .collect(); diff --git a/zarrs/src/array/codec/bytes_to_bytes/crc32c.rs b/zarrs/src/array/codec/bytes_to_bytes/crc32c.rs index e95ff365..7f8667cd 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/crc32c.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/crc32c.rs @@ -38,7 +38,7 @@ pub(crate) fn create_codec_crc32c(metadata: &MetadataV3) -> Result(); +const CHECKSUM_SIZE: usize = size_of::(); #[cfg(test)] mod tests { @@ -89,8 +89,7 @@ mod tests { assert_eq!(bytes, decoded.to_vec()); // Check that the checksum is correct - let checksum: &[u8; 4] = &encoded - [encoded.len() - core::mem::size_of::()..encoded.len()] + let checksum: &[u8; 4] = &encoded[encoded.len() - size_of::()..encoded.len()] .try_into() .unwrap(); println!("checksum {checksum:?}"); diff --git a/zarrs/src/array/codec/bytes_to_bytes/fletcher32.rs b/zarrs/src/array/codec/bytes_to_bytes/fletcher32.rs index 74411496..159ed82b 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/fletcher32.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/fletcher32.rs @@ -53,7 +53,7 @@ pub(crate) fn create_codec_fletcher32(metadata: &MetadataV3) -> Result(); +const CHECKSUM_SIZE: usize = size_of::(); 
#[cfg(test)] mod tests { @@ -106,8 +106,7 @@ mod tests { assert_eq!(bytes, decoded.to_vec()); // Check that the checksum is correct - let checksum: &[u8; 4] = &encoded - [encoded.len() - core::mem::size_of::()..encoded.len()] + let checksum: &[u8; 4] = &encoded[encoded.len() - size_of::()..encoded.len()] .try_into() .unwrap(); println!("checksum {checksum:?}"); diff --git a/zarrs/src/array/codec/bytes_to_bytes/gdeflate.rs b/zarrs/src/array/codec/bytes_to_bytes/gdeflate.rs index 9a14cf0c..ac74783e 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/gdeflate.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/gdeflate.rs @@ -37,7 +37,6 @@ use crate::{ pub use gdeflate::IDENTIFIER; -use core::mem::size_of; use std::sync::Arc; // Register the codec. @@ -329,7 +328,7 @@ mod tests { let decoded_partial_chunk: Vec = decoded_partial_chunk .to_vec() - .chunks_exact(std::mem::size_of::()) + .chunks_exact(size_of::()) .map(|b| u16::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![2, 3, 5]; @@ -372,7 +371,7 @@ mod tests { let decoded_partial_chunk: Vec = decoded_partial_chunk .to_vec() - .chunks_exact(std::mem::size_of::()) + .chunks_exact(size_of::()) .map(|b| u16::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![2, 3, 5]; diff --git a/zarrs/src/array/codec/bytes_to_bytes/gdeflate/gdeflate_codec.rs b/zarrs/src/array/codec/bytes_to_bytes/gdeflate/gdeflate_codec.rs index a26dc588..93ac2590 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/gdeflate/gdeflate_codec.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/gdeflate/gdeflate_codec.rs @@ -1,4 +1,3 @@ -use core::mem::size_of; use std::{borrow::Cow, sync::Arc}; use crate::{ diff --git a/zarrs/src/array/codec/bytes_to_bytes/gzip.rs b/zarrs/src/array/codec/bytes_to_bytes/gzip.rs index f9dab455..7be5438e 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/gzip.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/gzip.rs @@ -129,7 +129,7 @@ mod tests { let decoded_partial_chunk: Vec = 
decoded_partial_chunk .to_vec() - .chunks_exact(std::mem::size_of::()) + .chunks_exact(size_of::()) .map(|b| u16::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![2, 3, 5]; @@ -171,7 +171,7 @@ mod tests { let decoded_partial_chunk: Vec = decoded_partial_chunk .to_vec() - .chunks_exact(std::mem::size_of::()) + .chunks_exact(size_of::()) .map(|b| u16::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![2, 3, 5]; diff --git a/zarrs/src/array/codec/bytes_to_bytes/test_unbounded.rs b/zarrs/src/array/codec/bytes_to_bytes/test_unbounded.rs index 8336d9ba..90b379df 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/test_unbounded.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/test_unbounded.rs @@ -69,7 +69,7 @@ mod tests { let decoded_partial_chunk: Vec = decoded_partial_chunk .to_vec() - .chunks_exact(std::mem::size_of::()) + .chunks_exact(size_of::()) .map(|b| u16::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![2, 3, 5]; @@ -112,7 +112,7 @@ mod tests { let decoded_partial_chunk: Vec = decoded_partial_chunk .to_vec() - .chunks_exact(std::mem::size_of::()) + .chunks_exact(size_of::()) .map(|b| u16::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![2, 3, 5]; diff --git a/zarrs/src/array/codec/bytes_to_bytes/zstd.rs b/zarrs/src/array/codec/bytes_to_bytes/zstd.rs index 5e4ed1b5..88192699 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/zstd.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/zstd.rs @@ -114,7 +114,7 @@ mod tests { let decoded_partial_chunk: Vec = decoded_partial_chunk .to_vec() - .chunks_exact(std::mem::size_of::()) + .chunks_exact(size_of::()) .map(|b| u16::from_ne_bytes(b.try_into().unwrap())) .collect(); let answer: Vec = vec![2, 3, 5]; @@ -157,7 +157,7 @@ mod tests { let decoded_partial_chunk: Vec = decoded_partial_chunk .to_vec() - .chunks_exact(std::mem::size_of::()) + .chunks_exact(size_of::()) .map(|b| u16::from_ne_bytes(b.try_into().unwrap())) .collect(); let 
answer: Vec = vec![2, 3, 5]; diff --git a/zarrs/src/array/fill_value.rs b/zarrs/src/array/fill_value.rs index f885d8ce..98560702 100644 --- a/zarrs/src/array/fill_value.rs +++ b/zarrs/src/array/fill_value.rs @@ -118,7 +118,7 @@ impl From for FillValue { impl From for FillValue { fn from(value: num::complex::Complex32) -> Self { - let mut bytes = Vec::with_capacity(std::mem::size_of::()); + let mut bytes = Vec::with_capacity(size_of::()); bytes.extend(value.re.to_ne_bytes()); bytes.extend(value.im.to_ne_bytes()); Self(bytes) @@ -127,7 +127,7 @@ impl From for FillValue { impl From for FillValue { fn from(value: num::complex::Complex64) -> Self { - let mut bytes = Vec::with_capacity(std::mem::size_of::()); + let mut bytes = Vec::with_capacity(size_of::()); bytes.extend(value.re.to_ne_bytes()); bytes.extend(value.im.to_ne_bytes()); Self(bytes) diff --git a/zarrs/src/lib.rs b/zarrs/src/lib.rs index 5f9b9482..8b110100 100644 --- a/zarrs/src/lib.rs +++ b/zarrs/src/lib.rs @@ -204,7 +204,7 @@ pub use storage::byte_range; /// Get a mutable slice of the spare capacity in a vector. 
fn vec_spare_capacity_to_mut_slice(vec: &mut Vec) -> &mut [T] { let spare_capacity = vec.spare_capacity_mut(); - // SAFETY: `spare_capacity` is valid for both reads and writes for len * mem::size_of::() many bytes, and it is properly aligned + // SAFETY: `spare_capacity` is valid for both reads and writes for len * size_of::() many bytes, and it is properly aligned unsafe { std::slice::from_raw_parts_mut( spare_capacity.as_mut_ptr().cast::(), diff --git a/zarrs/tests/array_partial_encode.rs b/zarrs/tests/array_partial_encode.rs index 3fb4768d..6af530cc 100644 --- a/zarrs/tests/array_partial_encode.rs +++ b/zarrs/tests/array_partial_encode.rs @@ -3,7 +3,6 @@ use std::sync::Arc; -use core::mem::size_of; use zarrs::{ array::{ codec::{ diff --git a/zarrs_metadata/src/v3/array/fill_value.rs b/zarrs_metadata/src/v3/array/fill_value.rs index 67e140cc..f0a3e067 100644 --- a/zarrs_metadata/src/v3/array/fill_value.rs +++ b/zarrs_metadata/src/v3/array/fill_value.rs @@ -6,6 +6,8 @@ //! //! The interpretation of fill values is data type dependent. +use std::mem::size_of; // TODO: Can be removed with Rust 1.80+ + use derive_more::{Display, From}; use half::{bf16, f16}; use num::traits::float::FloatCore; @@ -63,11 +65,11 @@ impl FillValueFloat { Self::Float(float) => T::from(*float), Self::HexString(hex_string) => { let bytes: &[u8] = hex_string.as_be_bytes(); - if bytes.len() == core::mem::size_of::() { + if bytes.len() == size_of::() { // NOTE: Cleaner way of doing this? 
- if core::mem::size_of::() == core::mem::size_of::() { + if size_of::() == size_of::() { T::from(f32::from_be_bytes(bytes.try_into().unwrap_or_default())) - } else if core::mem::size_of::() == core::mem::size_of::() { + } else if size_of::() == size_of::() { T::from(f64::from_be_bytes(bytes.try_into().unwrap_or_default())) } else { None @@ -228,11 +230,11 @@ impl FillValueMetadataV3 { F::Float(float) => T::from(*float), F::HexString(hex_string) => { let bytes = hex_string.as_be_bytes(); - if bytes.len() == core::mem::size_of::() { + if bytes.len() == size_of::() { // NOTE: Cleaner way of doing this? - if core::mem::size_of::() == core::mem::size_of::() { + if size_of::() == size_of::() { T::from(f32::from_be_bytes(bytes.try_into().unwrap_or_default())) - } else if core::mem::size_of::() == core::mem::size_of::() { + } else if size_of::() == size_of::() { T::from(f64::from_be_bytes(bytes.try_into().unwrap_or_default())) } else { None @@ -247,9 +249,9 @@ impl FillValueMetadataV3 { NF::PosInfinity => Some(T::infinity()), NF::NegInfinity => Some(T::neg_infinity()), NF::NaN => { - if core::mem::size_of::() == core::mem::size_of::() { + if size_of::() == size_of::() { T::from(ZARR_NAN_F32) - } else if core::mem::size_of::() == core::mem::size_of::() { + } else if size_of::() == size_of::() { T::from(ZARR_NAN_F64) } else { None From 09d7a291533b24ba28179ebbf9b932e022208224 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Mon, 13 Jan 2025 07:47:46 +1100 Subject: [PATCH 03/45] chore: use `&raw mut` operator Rust 1.82+ --- zarrs/src/array/codec/bytes_to_bytes/blosc.rs | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/zarrs/src/array/codec/bytes_to_bytes/blosc.rs b/zarrs/src/array/codec/bytes_to_bytes/blosc.rs index d3f20b10..b8a00612 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/blosc.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/blosc.rs @@ -127,11 +127,7 @@ fn blosc_compress_bytes( fn blosc_validate(src: &[u8]) -> Option { let mut 
destsize: usize = 0; let valid = unsafe { - blosc_cbuffer_validate( - src.as_ptr().cast::(), - src.len(), - std::ptr::addr_of_mut!(destsize), - ) + blosc_cbuffer_validate(src.as_ptr().cast::(), src.len(), &raw mut destsize) } == 0; valid.then_some(destsize) } @@ -145,8 +141,8 @@ fn blosc_typesize(src: &[u8]) -> Option { unsafe { blosc_cbuffer_metainfo( src.as_ptr().cast::(), - std::ptr::addr_of_mut!(typesize), - std::ptr::addr_of_mut!(flags), + &raw mut typesize, + &raw mut flags, ); }; (typesize != 0).then_some(typesize) @@ -164,9 +160,9 @@ fn blosc_nbytes(src: &[u8]) -> Option { unsafe { blosc_cbuffer_sizes( src.as_ptr().cast::(), - std::ptr::addr_of_mut!(uncompressed_bytes), - std::ptr::addr_of_mut!(cbytes), - std::ptr::addr_of_mut!(blocksize), + &raw mut uncompressed_bytes, + &raw mut cbytes, + &raw mut blocksize, ); }; (uncompressed_bytes > 0 && cbytes > 0 && blocksize > 0).then_some(uncompressed_bytes) From 6850c0554435003f042f0c72ba90d4bbac4c518f Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Mon, 13 Jan 2025 07:52:54 +1100 Subject: [PATCH 04/45] chore: use `{f32,f64}::from_bits` Rust 1.82+ --- zarrs_metadata/src/v3/array/nan_representations.rs | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/zarrs_metadata/src/v3/array/nan_representations.rs b/zarrs_metadata/src/v3/array/nan_representations.rs index dc758633..cd054db4 100644 --- a/zarrs_metadata/src/v3/array/nan_representations.rs +++ b/zarrs_metadata/src/v3/array/nan_representations.rs @@ -2,24 +2,16 @@ //! //! Zarr uses the not-a-number (NaN) value where the sign bit is 0 (positive), the most significant bit (MSB) of the mantissa is 1, and all other bits of the mantissa are zero. -use std::mem::transmute; - use half::{bf16, f16}; -// https://github.com/rust-lang/rust/issues/72447 - /// The Zarr "NaN" fill value for a 64-bit IEEE 754 floating point number. 
#[allow(clippy::unusual_byte_groupings)] -pub const ZARR_NAN_F64: f64 = unsafe { - transmute::(0b0_11111111111_1000000000000000000000000000000000000000000000000000) -}; -// const ZARR_NAN_F64: f64 = f64::from_bits(0b0_11111111111_1000000000000000000000000000000000000000000000000000); +pub const ZARR_NAN_F64: f64 = + f64::from_bits(0b0_11111111111_1000000000000000000000000000000000000000000000000000); /// The Zarr "NaN" fill value for a 32-bit IEEE 754 floating point number. #[allow(clippy::unusual_byte_groupings)] -pub const ZARR_NAN_F32: f32 = - unsafe { transmute::(0b0_11111111_10000000000000000000000) }; -// const ZARR_NAN_F32: f32 = f32::from_bits(0b0_11111111_10000000000000000000000); +pub const ZARR_NAN_F32: f32 = f32::from_bits(0b0_11111111_10000000000000000000000); /// The Zarr "NaN" fill value for a 16-bit IEEE 754 floating point number. pub const ZARR_NAN_F16: f16 = f16::NAN; From 7c38c8db8937d473702ef899fc894b5b614720e1 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Mon, 13 Jan 2025 07:58:26 +1100 Subject: [PATCH 05/45] chore: use `LazyLock` Rust 1.80+ --- zarrs/src/config.rs | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/zarrs/src/config.rs b/zarrs/src/config.rs index ec0a985f..4ef2e490 100644 --- a/zarrs/src/config.rs +++ b/zarrs/src/config.rs @@ -5,7 +5,7 @@ use crate::metadata::v3::array::codec; use std::{ collections::HashMap, - sync::{OnceLock, RwLock, RwLockReadGuard, RwLockWriteGuard}, + sync::{LazyLock, RwLock, RwLockReadGuard, RwLockWriteGuard}, }; #[cfg(doc)] @@ -282,17 +282,14 @@ impl Config { } } -static CONFIG: OnceLock> = OnceLock::new(); +static CONFIG: LazyLock> = LazyLock::new(|| RwLock::new(Config::default())); /// Returns a reference to the global `zarrs` configuration. /// /// # Panics /// This function panics if the underlying lock has been poisoned and might panic if the global config is already held by the current thread. 
pub fn global_config() -> RwLockReadGuard<'static, Config> { - CONFIG - .get_or_init(|| RwLock::new(Config::default())) - .read() - .unwrap() + CONFIG.read().unwrap() } /// Returns a mutable reference to the global `zarrs` configuration. @@ -300,10 +297,7 @@ pub fn global_config() -> RwLockReadGuard<'static, Config> { /// # Panics /// This function panics if the underlying lock has been poisoned and might panic if the global config is already held by the current thread. pub fn global_config_mut() -> RwLockWriteGuard<'static, Config> { - CONFIG - .get_or_init(|| RwLock::new(Config::default())) - .write() - .unwrap() + CONFIG.write().unwrap() } /// The metadata version to retrieve. From 66fbc0b8fd5722bf3afd7eb8d05c54ff4f18cac9 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Mon, 13 Jan 2025 08:00:47 +1100 Subject: [PATCH 06/45] chore: changelog for Rust 1.78-1.82 language/library features --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b01ad5de..74220c82 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed +- Use new language/library features added between Rust 1.78-1.82 (internal) + ## [0.19.0] - 2025-01-10 ### Highlights From a1c150697f2356ef6f9bcbeee3c3b9f2f04712db Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Mon, 13 Jan 2025 08:15:49 +1100 Subject: [PATCH 07/45] Revert "chore: use `{f32,f64}::from_bits`" This reverts commit 6850c0554435003f042f0c72ba90d4bbac4c518f. --- zarrs_metadata/src/v3/array/nan_representations.rs | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/zarrs_metadata/src/v3/array/nan_representations.rs b/zarrs_metadata/src/v3/array/nan_representations.rs index cd054db4..dc758633 100644 --- a/zarrs_metadata/src/v3/array/nan_representations.rs +++ b/zarrs_metadata/src/v3/array/nan_representations.rs @@ -2,16 +2,24 @@ //! //! 
Zarr uses the not-a-number (NaN) value where the sign bit is 0 (positive), the most significant bit (MSB) of the mantissa is 1, and all other bits of the mantissa are zero. +use std::mem::transmute; + use half::{bf16, f16}; +// https://github.com/rust-lang/rust/issues/72447 + /// The Zarr "NaN" fill value for a 64-bit IEEE 754 floating point number. #[allow(clippy::unusual_byte_groupings)] -pub const ZARR_NAN_F64: f64 = - f64::from_bits(0b0_11111111111_1000000000000000000000000000000000000000000000000000); +pub const ZARR_NAN_F64: f64 = unsafe { + transmute::(0b0_11111111111_1000000000000000000000000000000000000000000000000000) +}; +// const ZARR_NAN_F64: f64 = f64::from_bits(0b0_11111111111_1000000000000000000000000000000000000000000000000000); /// The Zarr "NaN" fill value for a 32-bit IEEE 754 floating point number. #[allow(clippy::unusual_byte_groupings)] -pub const ZARR_NAN_F32: f32 = f32::from_bits(0b0_11111111_10000000000000000000000); +pub const ZARR_NAN_F32: f32 = + unsafe { transmute::(0b0_11111111_10000000000000000000000) }; +// const ZARR_NAN_F32: f32 = f32::from_bits(0b0_11111111_10000000000000000000000); /// The Zarr "NaN" fill value for a 16-bit IEEE 754 floating point number. 
pub const ZARR_NAN_F16: f16 = f16::NAN; From 61de648dfdecd7e11146a5cf72c9931b0e5af297 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Mon, 13 Jan 2025 08:25:06 +1100 Subject: [PATCH 08/45] chore(CI): reenable codecov (#130) --- .github/workflows/ci.yml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 802a1428..57a3c7d6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -87,16 +87,16 @@ jobs: - uses: Swatinem/rust-cache@v2 - run: cargo install cargo-hack cargo-minimal-versions --locked - run: cargo minimal-versions check --workspace --all-features --direct -# codecov: -# runs-on: ubuntu-latest -# steps: -# - uses: actions/checkout@v4 -# - run: sudo apt update && sudo apt install -y cmake clang-15 -# - uses: dtolnay/rust-toolchain@nightly -# - uses: Swatinem/rust-cache@v2 -# - run: cargo +nightly install cargo-llvm-cov --locked -# - run: cargo +nightly llvm-cov --all-features --doctests --lcov --output-path lcov.info -# - name: Upload coverage reports to Codecov -# uses: codecov/codecov-action@v4 -# with: -# token: ${{ secrets.CODECOV_TOKEN }} + codecov: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - run: sudo apt update && sudo apt install -y cmake clang-15 + - uses: dtolnay/rust-toolchain@nightly + - uses: Swatinem/rust-cache@v2 + - run: cargo +nightly install cargo-llvm-cov --locked + - run: cargo +nightly llvm-cov --all-features --doctests --lcov --output-path lcov.info + - name: Upload coverage reports to Codecov + uses: codecov/codecov-action@v4 + with: + token: ${{ secrets.CODECOV_TOKEN }} From feb1ee34403effd99aea6d05caffa40257eaeaad Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Mon, 13 Jan 2025 20:16:33 +1100 Subject: [PATCH 09/45] fix(docs): typo for `vlen` codec --- zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs b/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs index fcf08d4c..18d389f7 100644 --- a/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs +++ b/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs @@ -21,7 +21,7 @@ use crate::array::codec::{AsyncArrayPartialDecoderTraits, AsyncBytesPartialDecod use super::{vlen_partial_decoder, VlenCodecConfiguration, VlenCodecConfigurationV1}; -/// A `bytes` codec implementation. +/// A `vlen` codec implementation. #[derive(Debug, Clone)] pub struct VlenCodec { index_codecs: Arc, From e44051d40526964d7d2223bc120badf8fc58aa92 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Tue, 14 Jan 2025 08:00:38 +1100 Subject: [PATCH 10/45] fix: `clippy::double_ended_iterator_last` lint --- CHANGELOG.md | 3 +++ zarrs/src/node.rs | 7 ++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 74220c82..47a45c23 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Use new language/library features added between Rust 1.78-1.82 (internal) +### Fixed +- New clippy lints + ## [0.19.0] - 2025-01-10 ### Highlights diff --git a/zarrs/src/node.rs b/zarrs/src/node.rs index 66ee83c6..505c062e 100644 --- a/zarrs/src/node.rs +++ b/zarrs/src/node.rs @@ -322,7 +322,12 @@ impl Node { /// Returns the name of the node. 
#[must_use] pub fn name(&self) -> NodeName { - let name = self.path.as_str().split('/').last().unwrap_or_default(); + let name = self + .path + .as_str() + .split('/') + .next_back() + .unwrap_or_default(); unsafe { NodeName::new_unchecked(name) } } From 682cdc0591c15d5a6bebb5c16e93f58f00777a0a Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Tue, 14 Jan 2025 08:17:02 +1100 Subject: [PATCH 11/45] fix(docs): move the experimental partial encoding option Now correctly under the codec options heading --- zarrs/src/config.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/zarrs/src/config.rs b/zarrs/src/config.rs index 4ef2e490..f89410fb 100644 --- a/zarrs/src/config.rs +++ b/zarrs/src/config.rs @@ -54,6 +54,14 @@ use crate::array::{codec::CodecOptions, ArrayMetadataOptions}; /// This option sets the preferred minimum chunk concurrency. /// The concurrency of internal codecs is adjusted to accomodate for the chunk concurrency in accordance with the concurrent target set in the [`CodecOptions`] parameter of an encode or decode method. /// +/// ### Experimental Partial Encoding +/// > default: [`false`] +/// +/// If `true`, [`Array::store_chunk_subset`](crate::array::Array::store_chunk_subset) and [`Array::store_array_subset`](crate::array::Array::store_array_subset) and variants can use partial encoding. +/// This is relevant when using the sharding codec, as it enables inner chunks to be written without reading and writing entire shards. +/// +/// This is an experimental feature for now until it has more comprehensively tested and support is added in the async API. +/// /// ## Metadata Options /// /// ### Experimental Codec Store Metadata If Encode Only @@ -96,14 +104,6 @@ use crate::array::{codec::CodecOptions, ArrayMetadataOptions}; /// /// Sets the names used when serialising and deserialising the names of experimental codecs. /// Deserialisation also accepts the standard `IDENTIFIER` of the codec. 
-/// -/// ### Experimental Partial Encoding -/// > default: [`false`] -/// -/// If `true`, [`Array::store_chunk_subset`](crate::array::Array::store_chunk_subset) and [`Array::store_array_subset`](crate::array::Array::store_array_subset) and variants can use partial encoding. -/// This is relevant when using the sharding codec, as it enables inner chunks to be written without reading and writing entire shards. -/// -/// This is an experimental feature for now until it has more comprehensively tested and support is added in the async API. #[derive(Debug)] #[allow(clippy::struct_excessive_bools)] pub struct Config { From da007985444dccf0eda8234214229ca5fcdbf93b Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Thu, 16 Jan 2025 11:35:05 +1100 Subject: [PATCH 12/45] fix(docs): remove ecosystem tables Does not render particularly well on all devices/crates.io --- CHANGELOG.md | 1 + README.md | 133 ++++++++++++++++++++------------------- zarrs/doc/ecosystem.md | 89 ++++++++++++-------------- zarrs/doc/status/ZEPs.md | 14 ----- zarrs/src/lib.rs | 17 +++-- 5 files changed, 115 insertions(+), 139 deletions(-) delete mode 100644 zarrs/doc/status/ZEPs.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 47a45c23..14be0d1e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Use new language/library features added between Rust 1.78-1.82 (internal) +- Cleanup root docs and README removing ZEPs table and ecosystem table ### Fixed - New clippy lints diff --git a/README.md b/README.md index 9435fedc..694750fb 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,29 @@ # zarrs [![Latest Version](https://img.shields.io/crates/v/zarrs.svg)](https://crates.io/crates/zarrs) -[![zarrs documentation](https://docs.rs/zarrs/badge.svg)](https://docs.rs/zarrs) +[![zarrs documentation](https://docs.rs/zarrs/badge.svg)][documentation] ![msrv](https://img.shields.io/crates/msrv/zarrs) 
[![downloads](https://img.shields.io/crates/d/zarrs)](https://crates.io/crates/zarrs) [![build](https://github.com/LDeakin/zarrs/actions/workflows/ci.yml/badge.svg)](https://github.com/LDeakin/zarrs/actions/workflows/ci.yml) [![codecov](https://codecov.io/gh/LDeakin/zarrs/graph/badge.svg?token=OBKJQNAZPP)](https://codecov.io/gh/LDeakin/zarrs) [![DOI](https://zenodo.org/badge/695021547.svg)](https://zenodo.org/badge/latestdoi/695021547) -`zarrs` is a Rust library for the [Zarr](https://zarr.dev) storage format for multidimensional arrays and metadata. It supports [Zarr V3](https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html) and a [V3 compatible subset](https://docs.rs/zarrs/latest/zarrs/#implementation-status) of [Zarr V2](https://zarr-specs.readthedocs.io/en/latest/v2/v2.0.html). +`zarrs` is a Rust library for the [Zarr] storage format for multidimensional arrays and metadata. It supports [Zarr V3] and a V3 compatible subset of [Zarr V2]. -A changelog can be found [here](https://github.com/LDeakin/zarrs/blob/main/CHANGELOG.md). -Correctness issues with past versions are [detailed here](https://github.com/LDeakin/zarrs/blob/main/doc/correctness_issues.md). +A changelog can be found [here][CHANGELOG]. +Correctness issues with past versions are [detailed here][correctness_issues]. -Developed at the [Department of Materials Physics](https://physics.anu.edu.au/research/mp/), Australian National University, Canberra, Australia. +Developed at the [Department of Materials Physics, Australian National University, Canberra, Australia]. > [!TIP] -> If you are a Python user, check out [`zarrs-python`](https://github.com/ilan-gold/zarrs-python). -> It includes a high-performance codec pipeline for the reference [`zarr-python`](https://github.com/zarr-developers/zarr-python) implementation. +> If you are a Python user, check out [`zarrs-python`]. +> It includes a high-performance codec pipeline for the reference [`zarr-python`] implementation. 
## Getting Started -- Review the [implementation status](https://docs.rs/zarrs/latest/zarrs/#implementation-status), [array support](https://docs.rs/zarrs/latest/zarrs/#array-support), and [storage support](https://docs.rs/zarrs/latest/zarrs/#storage-support). -- Read [The `zarrs` Book](https://book.zarrs.dev). -- View the [examples](https://github.com/LDeakin/zarrs/tree/main/zarrs/examples) and [the example below](#example). -- Read the [documentation](https://docs.rs/zarrs/latest/zarrs/). [`array::Array`](https://docs.rs/zarrs/latest/zarrs/array/struct.Array.html) is a good place to start. -- Check out the [`zarrs` ecosystem](#zarrs-ecosystem). +- Review the [implementation status] ([zarr version support], [array support], [storage support], and the [`zarrs` ecosystem](#zarrs-ecosystem)). +- Read [The `zarrs` Book]. +- View the [examples] and [the example below](#example). +- Read the [documentation]. ## Example ```rust @@ -90,61 +89,30 @@ println!("{array_ndarray:4}"); ## `zarrs` Ecosystem -| Crate | Docs / Description | -| --------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- | -| **Core** | | -| [![zarrs_ver]](https://crates.io/crates/zarrs) [zarrs] | [![docs]](https://docs.rs/zarrs) The core library for manipulating Zarr hierarchies | -| [![zarrs_metadata_ver]](https://crates.io/crates/zarrs_metadata) [zarrs_metadata] | [![docs]](https://docs.rs/zarrs_metadata) Zarr metadata support (re-exported as `zarrs::metadata`) | -| [![zarrs_storage_ver]](https://crates.io/crates/zarrs_storage) [zarrs_storage] | [![docs]](https://docs.rs/zarrs_storage) The storage API for `zarrs` (re-exported as `zarrs::storage`) | -| **Stores** | | -| [![zarrs_filesystem_ver]](https://crates.io/crates/zarrs_filesystem) [zarrs_filesystem] | [![docs]](https://docs.rs/zarrs_filesystem) A filesystem store (re-exported as 
`zarrs::filesystem`) | -| [![zarrs_object_store_ver]](https://crates.io/crates/zarrs_object_store) [zarrs_object_store] | [![docs]](https://docs.rs/zarrs_object_store) [`object_store`](https://docs.rs/object_store/latest/object_store/) store support | -| [![zarrs_opendal_ver]](https://crates.io/crates/zarrs_opendal) [zarrs_opendal] | [![docs]](https://docs.rs/zarrs_opendal) [`opendal`](https://docs.rs/opendal/latest/opendal/) store support | -| [![zarrs_http_ver]](https://crates.io/crates/zarrs_http) [zarrs_http] | [![docs]](https://docs.rs/zarrs_http) A synchronous http store | -| [![zarrs_zip_ver]](https://crates.io/crates/zarrs_zip) [zarrs_zip] | [![docs]](https://docs.rs/zarrs_zip) A storage adapter for zip files | -| [![zarrs_icechunk_ver]](https://crates.io/crates/zarrs_icechunk) [zarrs_icechunk] | [![docs]](https://docs.rs/zarrs_icechunk) [`icechunk`](https://docs.rs/icechunk/latest/icechunk/) store support | -| **Bindings** | | -| [![zarrs_python_ver]](https://pypi.org/project/zarrs/) [zarrs-python] | [![docs]](https://zarrs-python.readthedocs.io/en/latest/) A codec pipeline for [zarr-python] | -| [![zarrs_ffi_ver]](https://crates.io/crates/zarrs_ffi) [zarrs_ffi] | [![docs]](https://docs.rs/zarrs_ffi) A subset of `zarrs` exposed as a C/C++ API | -| **Zarr Metadata Conventions** | | -| [![ome_zarr_metadata_ver]](https://crates.io/crates/ome_zarr_metadata) [ome_zarr_metadata] | [![docs]](https://docs.rs/ome_zarr_metadata) A library for OME-Zarr (previously OME-NGFF) metadata | - -[docs]: https://img.shields.io/badge/docs-brightgreen -[zarrs_ver]: https://img.shields.io/crates/v/zarrs -[zarrs]: https://github.com/LDeakin/zarrs/tree/main/zarrs -[zarrs_metadata_ver]: https://img.shields.io/crates/v/zarrs_metadata -[zarrs_metadata]: https://github.com/LDeakin/zarrs/tree/main/zarrs_metadata -[zarrs_storage_ver]: https://img.shields.io/crates/v/zarrs_storage -[zarrs_storage]: https://github.com/LDeakin/zarrs/tree/main/zarrs_storage -[zarrs_filesystem_ver]: 
https://img.shields.io/crates/v/zarrs_filesystem -[zarrs_filesystem]: https://github.com/LDeakin/zarrs/tree/main/zarrs_filesystem -[zarrs_http_ver]: https://img.shields.io/crates/v/zarrs_http -[zarrs_http]: https://github.com/LDeakin/zarrs/tree/main/zarrs_http -[zarrs_object_store_ver]: https://img.shields.io/crates/v/zarrs_object_store -[zarrs_object_store]: https://github.com/LDeakin/zarrs/tree/main/zarrs_object_store -[zarrs_opendal_ver]: https://img.shields.io/crates/v/zarrs_opendal -[zarrs_opendal]: https://github.com/LDeakin/zarrs/tree/main/zarrs_opendal -[zarrs_zip_ver]: https://img.shields.io/crates/v/zarrs_zip -[zarrs_zip]: https://github.com/LDeakin/zarrs/tree/main/zarrs_zip -[zarrs_icechunk_ver]: https://img.shields.io/crates/v/zarrs_icechunk -[zarrs_icechunk]: https://github.com/LDeakin/zarrs_icechunk -[zarrs_ffi_ver]: https://img.shields.io/crates/v/zarrs_ffi -[zarrs_ffi]: https://github.com/LDeakin/zarrs_ffi -[zarrs_python_ver]: https://img.shields.io/pypi/v/zarrs -[zarrs-python]: https://github.com/ilan-gold/zarrs-python -[zarr-python]: https://github.com/zarr-developers/zarr-python -[ome_zarr_metadata_ver]: https://img.shields.io/crates/v/ome_zarr_metadata -[ome_zarr_metadata]: https://github.com/LDeakin/rust_ome_zarr_metadata - -#### [zarrs_tools] -[![zarrs_tools_ver]](https://crates.io/crates/zarrs_tools) [![zarrs_tools_doc]](https://docs.rs/zarrs_tools) - -[zarrs_tools]: https://github.com/LDeakin/zarrs_tools -[zarrs_tools_ver]: https://img.shields.io/crates/v/zarrs_tools.svg -[zarrs_tools_doc]: https://docs.rs/zarrs_tools/badge.svg +### Core +- [`zarrs`]: The core library for manipulating Zarr hierarchies. +- [`zarrs_metadata`]: Zarr metadata support (re-exported as `zarrs::metadata`). +- [`zarrs_storage`]: The storage API for `zarrs` (re-exported as `zarrs::storage`). +### Stores +- [`zarrs_filesystem`]: A filesystem store (re-exported as `zarrs::filesystem`). +- [`zarrs_object_store`]: [`object_store`] store support. 
+- [`zarrs_opendal`]: [`opendal`] store support. +- [`zarrs_http`]: A synchronous http store. +- [`zarrs_zip`]: A storage adapter for zip files. +- [`zarrs_icechunk`]: [`icechunk`] store support. + +### Bindings +- [`zarrs-python`]: A high-performance codec pipeline for [`zarr-python`]. +- [`zarrs_ffi`]: A subset of `zarrs` exposed as a C/C++ API. + +### Zarr Metadata Conventions +- [`ome_zarr_metadata`]: A library for OME-Zarr (previously OME-NGFF) metadata. + +### Tools +- [`zarrs_tools`]: Various tools for creating and manipulating Zarr V3 data with the zarrs rust crate - A reencoder that can change codecs, chunk shape, convert Zarr V2 to V3, etc. - - Create an [OME-Zarr](https://ngff.openmicroscopy.org/latest/) hierarchy from a Zarr array. + - Create an [OME-Zarr] hierarchy from a Zarr array. - Transform arrays: crop, rescale, downsample, gradient magnitude, gaussian, noise filtering, etc. - Benchmarking tools and performance benchmarks of `zarrs`. @@ -154,3 +122,38 @@ println!("{array_ndarray:4}"); - the MIT license [LICENSE-MIT](./LICENCE-MIT) or , at your option. Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any additional terms or conditions. 
+ +[CHANGELOG]: https://github.com/LDeakin/zarrs/blob/main/CHANGELOG.md +[correctness_issues]: https://github.com/LDeakin/zarrs/blob/main/doc/correctness_issues.md +[implementation status]: https://docs.rs/zarrs/latest/zarrs/#implementation-status +[zarr version support]: https://docs.rs/zarrs/latest/zarrs/#zarr-version-support +[array support]: https://docs.rs/zarrs/latest/zarrs/#array-support +[storage support]: https://docs.rs/zarrs/latest/zarrs/#storage-support +[examples]: https://github.com/LDeakin/zarrs/tree/main/zarrs/examples +[documentation]: https://docs.rs/zarrs/latest/zarrs/ +[The `zarrs` Book]: https://book.zarrs.dev + +[`zarrs`]: https://github.com/LDeakin/zarrs/tree/main/zarrs +[`zarrs_metadata`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_metadata +[`zarrs_storage`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_storage +[`zarrs_filesystem`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_filesystem +[`zarrs_http`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_http +[`zarrs_object_store`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_object_store +[`zarrs_opendal`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_opendal +[`zarrs_zip`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_zip +[`zarrs_icechunk`]: https://github.com/LDeakin/zarrs_icechunk +[`zarrs_ffi`]: https://github.com/LDeakin/zarrs_ffi +[`zarrs-python`]: https://github.com/ilan-gold/zarrs-python +[`zarr-python`]: https://github.com/zarr-developers/zarr-python +[`zarrs_tools`]: https://github.com/LDeakin/zarrs_tools +[`ome_zarr_metadata`]: https://github.com/LDeakin/rust_ome_zarr_metadata +[`object_store`]: https://github.com/apache/arrow-rs/tree/main/object_store +[`opendal`]: https://github.com/apache/OpenDAL +[`icechunk`]: https://github.com/earth-mover/icechunk + +[Zarr]: https://zarr.dev +[Zarr V3]: https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html +[Zarr V2]: https://zarr-specs.readthedocs.io/en/latest/v2/v2.0.html +[OME-Zarr]: 
https://ngff.openmicroscopy.org/latest/ + +[Department of Materials Physics, Australian National University, Canberra, Australia]: https://physics.anu.edu.au/research/mp/ diff --git a/zarrs/doc/ecosystem.md b/zarrs/doc/ecosystem.md index a048289d..b132e0f1 100644 --- a/zarrs/doc/ecosystem.md +++ b/zarrs/doc/ecosystem.md @@ -1,57 +1,46 @@ -| Crate | Docs / Description | -| --------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- | -| **Core** | | -| [![zarrs_ver]](https://crates.io/crates/zarrs) [zarrs] | [![docs]](https://docs.rs/zarrs) The core library for manipulating Zarr hierarchies | -| [![zarrs_metadata_ver]](https://crates.io/crates/zarrs_metadata) [zarrs_metadata] | [![docs]](https://docs.rs/zarrs_metadata) Zarr metadata support (re-exported as `zarrs::metadata`) | -| [![zarrs_storage_ver]](https://crates.io/crates/zarrs_storage) [zarrs_storage] | [![docs]](https://docs.rs/zarrs_storage) The storage API for `zarrs` (re-exported as `zarrs::storage`) | -| **Stores** | | -| [![zarrs_filesystem_ver]](https://crates.io/crates/zarrs_filesystem) [zarrs_filesystem] | [![docs]](https://docs.rs/zarrs_filesystem) A filesystem store (re-exported as `zarrs::filesystem`) | -| [![zarrs_object_store_ver]](https://crates.io/crates/zarrs_object_store) [zarrs_object_store] | [![docs]](https://docs.rs/zarrs_object_store) [`object_store`](https://docs.rs/object_store/latest/object_store/) store support | -| [![zarrs_opendal_ver]](https://crates.io/crates/zarrs_opendal) [zarrs_opendal] | [![docs]](https://docs.rs/zarrs_opendal) [`opendal`](https://docs.rs/opendal/latest/opendal/) store support | -| [![zarrs_http_ver]](https://crates.io/crates/zarrs_http) [zarrs_http] | [![docs]](https://docs.rs/zarrs_http) A synchronous http store | -| [![zarrs_zip_ver]](https://crates.io/crates/zarrs_zip) [zarrs_zip] | 
[![docs]](https://docs.rs/zarrs_zip) A storage adapter for zip files | -| [![zarrs_icechunk_ver]](https://crates.io/crates/zarrs_icechunk) [zarrs_icechunk] | [![docs]](https://docs.rs/zarrs_icechunk) [`icechunk`](https://docs.rs/icechunk/latest/icechunk/) store support | -| **Bindings** | | -| [![zarrs_python_ver]](https://pypi.org/project/zarrs/) [zarrs-python] | [![docs]](https://zarrs-python.readthedocs.io/en/latest/) A codec pipeline for [zarr-python] | -| [![zarrs_ffi_ver]](https://crates.io/crates/zarrs_ffi) [zarrs_ffi] | [![docs]](https://docs.rs/zarrs_ffi) A subset of `zarrs` exposed as a C/C++ API | -| **Zarr Metadata Conventions** | | -| [![ome_zarr_metadata_ver]](https://crates.io/crates/ome_zarr_metadata) [ome_zarr_metadata] | [![docs]](https://docs.rs/ome_zarr_metadata) A library for OME-Zarr (previously OME-NGFF) metadata | +#### Core +- [`zarrs`]: The core library for manipulating Zarr hierarchies. +- [`zarrs_metadata`]: Zarr metadata support (re-exported as `zarrs::metadata`). +- [`zarrs_storage`]: The storage API for `zarrs` (re-exported as `zarrs::storage`). 
-[docs]: https://img.shields.io/badge/docs-brightgreen -[zarrs_ver]: https://img.shields.io/crates/v/zarrs -[zarrs]: https://github.com/LDeakin/zarrs/tree/main/zarrs -[zarrs_metadata_ver]: https://img.shields.io/crates/v/zarrs_metadata -[zarrs_metadata]: https://github.com/LDeakin/zarrs/tree/main/zarrs_metadata -[zarrs_storage_ver]: https://img.shields.io/crates/v/zarrs_storage -[zarrs_storage]: https://github.com/LDeakin/zarrs/tree/main/zarrs_storage -[zarrs_filesystem_ver]: https://img.shields.io/crates/v/zarrs_filesystem -[zarrs_filesystem]: https://github.com/LDeakin/zarrs/tree/main/zarrs_filesystem -[zarrs_http_ver]: https://img.shields.io/crates/v/zarrs_http -[zarrs_http]: https://github.com/LDeakin/zarrs/tree/main/zarrs_http -[zarrs_object_store_ver]: https://img.shields.io/crates/v/zarrs_object_store -[zarrs_object_store]: https://github.com/LDeakin/zarrs/tree/main/zarrs_object_store -[zarrs_opendal_ver]: https://img.shields.io/crates/v/zarrs_opendal -[zarrs_opendal]: https://github.com/LDeakin/zarrs/tree/main/zarrs_opendal -[zarrs_zip_ver]: https://img.shields.io/crates/v/zarrs_zip -[zarrs_zip]: https://github.com/LDeakin/zarrs/tree/main/zarrs_zip -[zarrs_icechunk_ver]: https://img.shields.io/crates/v/zarrs_icechunk -[zarrs_icechunk]: https://github.com/LDeakin/zarrs_icechunk -[zarrs_ffi_ver]: https://img.shields.io/crates/v/zarrs_ffi -[zarrs_ffi]: https://github.com/LDeakin/zarrs_ffi -[zarrs_python_ver]: https://img.shields.io/pypi/v/zarrs -[zarrs-python]: https://github.com/ilan-gold/zarrs-python -[zarr-python]: https://github.com/zarr-developers/zarr-python -[ome_zarr_metadata_ver]: https://img.shields.io/crates/v/ome_zarr_metadata -[ome_zarr_metadata]: https://github.com/LDeakin/rust_ome_zarr_metadata +#### Stores +- [`zarrs_filesystem`]: A filesystem store (re-exported as `zarrs::filesystem`). +- [`zarrs_object_store`]: [`object_store`] store support. +- [`zarrs_opendal`]: [`opendal`] store support. +- [`zarrs_http`]: A synchronous http store. 
+- [`zarrs_zip`]: A storage adapter for zip files. +- [`zarrs_icechunk`]: [`icechunk`] store support. -#### [zarrs_tools] -[![zarrs_tools_ver]](https://crates.io/crates/zarrs_tools) [![zarrs_tools_doc]](https://docs.rs/zarrs_tools) +#### Bindings +- [`zarrs-python`]: A high-performance codec pipeline for [`zarr-python`]. +- [`zarrs_ffi`]: A subset of `zarrs` exposed as a C/C++ API. -[zarrs_tools]: https://github.com/LDeakin/zarrs_tools -[zarrs_tools_ver]: https://img.shields.io/crates/v/zarrs_tools.svg -[zarrs_tools_doc]: https://docs.rs/zarrs_tools/badge.svg +#### Zarr Metadata Conventions +- [`ome_zarr_metadata`]: A library for OME-Zarr (previously OME-NGFF) metadata. +#### Tools +- [`zarrs_tools`]: Various tools for creating and manipulating Zarr V3 data with the zarrs rust crate - A reencoder that can change codecs, chunk shape, convert Zarr V2 to V3, etc. - - Create an [OME-Zarr](https://ngff.openmicroscopy.org/latest/) hierarchy from a Zarr array. + - Create an [OME-Zarr] hierarchy from a Zarr array. - Transform arrays: crop, rescale, downsample, gradient magnitude, gaussian, noise filtering, etc. - Benchmarking tools and performance benchmarks of `zarrs`. 
+ +[`zarrs`]: https://github.com/LDeakin/zarrs/tree/main/zarrs +[`zarrs_metadata`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_metadata +[`zarrs_storage`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_storage +[`zarrs_filesystem`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_filesystem +[`zarrs_http`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_http +[`zarrs_object_store`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_object_store +[`zarrs_opendal`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_opendal +[`zarrs_zip`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_zip +[`zarrs_icechunk`]: https://github.com/LDeakin/zarrs_icechunk +[`zarrs_ffi`]: https://github.com/LDeakin/zarrs_ffi +[`zarrs-python`]: https://github.com/ilan-gold/zarrs-python +[`zarr-python`]: https://github.com/zarr-developers/zarr-python +[`zarrs_tools`]: https://github.com/LDeakin/zarrs_tools +[`ome_zarr_metadata`]: https://github.com/LDeakin/rust_ome_zarr_metadata +[`object_store`]: https://github.com/apache/arrow-rs/tree/main/object_store +[`opendal`]: https://github.com/apache/OpenDAL +[`icechunk`]: https://github.com/earth-mover/icechunk + +[OME-Zarr]: https://ngff.openmicroscopy.org/latest/ diff --git a/zarrs/doc/status/ZEPs.md b/zarrs/doc/status/ZEPs.md deleted file mode 100644 index fd653605..00000000 --- a/zarrs/doc/status/ZEPs.md +++ /dev/null @@ -1,14 +0,0 @@ -| [Zarr Enhancement Proposal] | Status | Zarrs | -| --------------------------------------- | -------------------------- | ------------ | -| [ZEP0001]: Zarr specification version 3 | Accepted | Full support | -| [ZEP0002]: Sharding codec | Accepted | Full support | -| Draft [ZEP0003]: Variable chunking | [zarr-developers #52] | Full support | -| Draft ZEP0007: Strings | [zarr-developers/zeps #47] | Prototype | - -[Zarr Enhancement Proposal]: https://zarr.dev/zeps/ -[ZEP0001]: https://zarr.dev/zeps/accepted/ZEP0001.html -[ZEP0002]: https://zarr.dev/zeps/accepted/ZEP0002.html -[ZEP0003]: 
https://zarr.dev/zeps/draft/ZEP0003.html - -[zarr-developers #52]: https://github.com/orgs/zarr-developers/discussions/52 -[zarr-developers/zeps #47]: https://github.com/zarr-developers/zeps/pull/47#issuecomment-1710505141 diff --git a/zarrs/src/lib.rs b/zarrs/src/lib.rs index 8b110100..29586900 100644 --- a/zarrs/src/lib.rs +++ b/zarrs/src/lib.rs @@ -10,27 +10,22 @@ //! //! ## Getting Started //! - Review the [implementation status](#implementation-status), [array support](#array-support), and [storage support](#storage-support). -//! - Read [The `zarrs` Book](https://book.zarrs.dev). +//! - Read [The `zarrs` Book]. //! - View the [examples](https://github.com/LDeakin/zarrs/tree/main/zarrs/examples) and [the example below](#examples). //! - Read the [documentation](https://docs.rs/zarrs/latest/zarrs/). [`array::Array`] is a good place to start. //! - Check out the [`zarrs` ecosystem](#zarrs-ecosystem). //! //! ## Implementation Status //! -#![doc = include_str!("../doc/status/ZEPs.md")] +//! #### Zarr Version Support //! //! `zarrs` has first-class Zarr V3 support and additionally supports a *compatible subset* of Zarr V2 data that: //! - can be converted to V3 with only a metadata change, and //! - uses array metadata that is recognised and supported for encoding/decoding. //! -//! An existing V2 or V3 array can be opened with [`Array::open`](crate::array::Array::open). -//! A new array can be created from V2 or V3 metadata with [`Array::new_with_metadata`](crate::array::Array::new_with_metadata). -//! The [`ArrayBuilder`](crate::array::ArrayBuilder) only supports V3 array creation. +//! `zarrs` supports forward conversion from Zarr V2 to V3. See ["Converting Zarr V2 to V3"](https://book.zarrs.dev/v2_to_v3.html) in [The `zarrs` Book], or try the [`zarrs_reencode`](https://github.com/LDeakin/zarrs_tools/blob/main/docs/zarrs_reencode.md) CLI tool. //! -//! `zarrs` supports forward conversion of Zarr V2 data to V3. -//! 
See ["Metadata Convert Version"](crate::config::Config#metadata-convert-version) and ["Metadata Erase Version"](crate::config::Config#metadata-erase-version) for information about manipulating the version of array/group metadata. -//! -//! ### Array Support +//! #### Array Support //! //!
Data Types //! @@ -62,7 +57,7 @@ #![doc = include_str!("../doc/status/storage_transformers.md")] //!
//! -//! ### Storage Support +//! #### Storage Support //! //! `zarrs` supports stores (filesystem, HTTP, S3, etc.) via crates implementing the [`zarrs_storage`] API. //! @@ -182,6 +177,8 @@ //! - the MIT license [LICENSE-MIT](https://docs.rs/crate/zarrs/latest/source/LICENCE-MIT) or , at your option. //! //! Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any additional terms or conditions. +//! +//! [The `zarrs` Book]: https://book.zarrs.dev #![cfg_attr(docsrs, feature(doc_auto_cfg))] pub mod array; From 2f5a66520445f482695db6b5b51c2f83333f7dd1 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Fri, 17 Jan 2025 08:23:33 +1100 Subject: [PATCH 13/45] chore(docs): document that elements in `ArrayBytes` must be in C-contiguous order --- CHANGELOG.md | 3 +++ zarrs/src/array/array_bytes.rs | 23 +++++++++++++++++++---- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 14be0d1e..baea6c29 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added +- Document that elements in `ArrayBytes` must be in C-contiguous order + ### Changed - Use new language/library features added between Rust 1.78-1.82 (internal) - Cleanup root docs and README removing ZEPs table and ecosystem table diff --git a/zarrs/src/array/array_bytes.rs b/zarrs/src/array/array_bytes.rs index 51db02c9..53c4e7b5 100644 --- a/zarrs/src/array/array_bytes.rs +++ b/zarrs/src/array/array_bytes.rs @@ -13,19 +13,30 @@ use crate::{ use super::{codec::CodecError, ravel_indices, ArraySize, DataType, FillValue}; /// Array element bytes. 
+/// +/// These can represent: +/// - [`ArrayBytes::Fixed`]: fixed length elements of an array in C-contiguous order, +/// - [`ArrayBytes::Variable`]: variable length elements of an array in C-contiguous order with padding permitted, +/// - Encoded array bytes after an array to bytes or bytes to bytes codecs. pub type RawBytes<'a> = Cow<'a, [u8]>; /// Array element byte offsets. -pub type RawBytesOffsets<'a> = Cow<'a, [usize]>; +/// +/// These must be monotonically increasing. See [`ArrayBytes::Variable`]. +pub type RawBytesOffsets<'a> = Cow<'a, [usize]>; // FIXME: Switch to a validated newtype in zarrs 0.20 /// Fixed or variable length array bytes. -/// -/// Offsets are [`None`] if bytes are composed of fixed size data types. #[derive(Clone, Debug, PartialEq, Eq)] pub enum ArrayBytes<'a> { /// Bytes for a fixed length array. + /// + /// These represent elements in C-contiguous order (i.e. row-major order) where the last dimension varies the fastest. Fixed(RawBytes<'a>), /// Bytes and element byte offsets for a variable length array. + /// + /// The bytes and offsets are modeled on the [Apache Arrow Variable-size Binary Layout](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout). + /// - The offsets buffer contains length + 1 ~~signed integers (either 32-bit or 64-bit, depending on the data type)~~ usize integers. + /// - Offsets must be monotonically increasing, that is `offsets[j+1] >= offsets[j]` for `0 <= j < length`, even for null slots. Thus, the bytes represent C-contiguous elements with padding permitted. Variable(RawBytes<'a>, RawBytesOffsets<'a>), } @@ -39,6 +50,8 @@ pub enum ArrayBytesError { impl<'a> ArrayBytes<'a> { /// Create a new fixed length array bytes from `bytes`. + /// + /// `bytes` must be C-contiguous. pub fn new_flen(bytes: impl Into<RawBytes<'a>>) -> Self { Self::Fixed(bytes.into()) } @@ -46,11 +59,13 @@ impl<'a> ArrayBytes<'a> { /// Create a new variable length array bytes from `bytes` and `offsets`.
pub fn new_vlen( bytes: impl Into<RawBytes<'a>>, - offsets: impl Into<RawBytesOffsets<'a>>, + offsets: impl Into<RawBytesOffsets<'a>>, // FIXME: TryInto ) -> Self { Self::Variable(bytes.into(), offsets.into()) } + // TODO: new_vlen_unchecked + /// Create a new [`ArrayBytes`] with `num_elements` composed entirely of the `fill_value`. /// /// # Panics From b493c5e7c489d047290684509923d957dc082f96 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Sat, 18 Jan 2025 09:45:10 +1100 Subject: [PATCH 14/45] fix(docs): mark `String` and `Bytes` data types as experimental This was already documented in the crate root docs --- CHANGELOG.md | 1 + zarrs/src/array/data_type.rs | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index baea6c29..7b47509d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - New clippy lints +- Mark `String` and `Bytes` data types as experimental in their docs ## [0.19.0] - 2025-01-10 diff --git a/zarrs/src/array/data_type.rs b/zarrs/src/array/data_type.rs index 478a8907..0ca4300e 100644 --- a/zarrs/src/array/data_type.rs +++ b/zarrs/src/array/data_type.rs @@ -53,9 +53,13 @@ pub enum DataType { Complex128, /// `r*` raw bits, variable size given by *, limited to be a multiple of 8. RawBits(usize), // the stored usize is the size in bytes - /// A UTF-8 encoded string. + /// A UTF-8 encoded string. **Experimental**. + /// + /// This data type is not standardised in the Zarr V3 specification. String, - /// Variable-sized binary data. + /// Variable-sized binary data. **Experimental**. + /// + /// This data type is not standardised in the Zarr V3 specification.
Bytes, } From aecaffb1183665419c9ea9a3f0b8f34411cbc600 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Sat, 18 Jan 2025 09:48:38 +1100 Subject: [PATCH 15/45] fix(docs): mark `rectangular` chunk grid as experimental since it is based on a draft ZEP --- CHANGELOG.md | 1 + zarrs/doc/status/chunk_grids.md | 8 ++++---- zarrs/src/array/chunk_grid/rectangular.rs | 2 ++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7b47509d..fc854e03 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - New clippy lints - Mark `String` and `Bytes` data types as experimental in their docs +- Mark `rectangular` chunk grid as experimental since it is based on a draft ZEP ## [0.19.0] - 2025-01-10 diff --git a/zarrs/doc/status/chunk_grids.md b/zarrs/doc/status/chunk_grids.md index 3de5024e..ff033044 100644 --- a/zarrs/doc/status/chunk_grids.md +++ b/zarrs/doc/status/chunk_grids.md @@ -1,7 +1,7 @@ -| Chunk Grid | ZEP | V3 | V2 | Feature Flag | -| ------------- | --------- | ------- | ------- | ------------ | -| [regular] | [ZEP0001] | ✓ | ✓ | | -| [rectangular] | [ZEP0003] | ✓ | | | +| Chunk Grid | ZEP | V3 | V2 | Feature Flag | +| ---------------------------- | ----------------- | ------- | ------- | ------------ | +| [regular] | [ZEP0001] | ✓ | ✓ | | +| [rectangular] (experimental) | [ZEP0003] (draft) | ✓ | | | [regular]: crate::array::chunk_grid::RegularChunkGrid [rectangular]: crate::array::chunk_grid::RectangularChunkGrid diff --git a/zarrs/src/array/chunk_grid/rectangular.rs b/zarrs/src/array/chunk_grid/rectangular.rs index 05733fb5..6dcedd59 100644 --- a/zarrs/src/array/chunk_grid/rectangular.rs +++ b/zarrs/src/array/chunk_grid/rectangular.rs @@ -1,5 +1,7 @@ //! The `rectangular` chunk grid. //! +//! This chunk grid is considered experimental as it is based on a draft Zarr enhancement proposal. +//! //! See <https://zarr.dev/zeps/draft/ZEP0003.html>.
use std::num::NonZeroU64; From acf55bbc20aad473bd402233e8e6a099c1e77c0d Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Sat, 18 Jan 2025 09:55:46 +1100 Subject: [PATCH 16/45] fix(docs): add missing invariant to `[partial_]decode_into` safety docs --- CHANGELOG.md | 1 + zarrs/src/array/codec.rs | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fc854e03..711cd97a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - New clippy lints - Mark `String` and `Bytes` data types as experimental in their docs - Mark `rectangular` chunk grid as experimental since it is based on a draft ZEP +- Add missing invariant to `[partial_]decode_into` safety docs ## [0.19.0] - 2025-01-10 diff --git a/zarrs/src/array/codec.rs b/zarrs/src/array/codec.rs index 12d09bb7..1850b43c 100644 --- a/zarrs/src/array/codec.rs +++ b/zarrs/src/array/codec.rs @@ -364,8 +364,9 @@ pub trait ArrayPartialDecoderTraits: Any + Send + Sync { /// # Safety /// The caller must ensure that: /// - `output` holds enough space for the preallocated bytes of an array with shape `output_shape` of the appropriate data type, - /// - `output_subset` is within the bounds of `output_shape`, and - /// - `output_subset` has the same number of elements as `array_subset`. + /// - `output_subset` is within the bounds of `output_shape`, + /// - `output_subset` has the same number of elements as `array_subset`, and + /// - `output_subset`s must be non-overlapping when called in parallel on the same `output`. 
unsafe fn partial_decode_into( &self, array_subset: &ArraySubset, @@ -715,8 +716,9 @@ pub trait ArrayToBytesCodecTraits: ArrayCodecTraits + core::fmt::Debug { /// # Safety /// The caller must ensure that: /// - `output` holds enough space for the preallocated bytes of an array with shape `output_shape` of the appropriate data type, and - /// - `output_subset` is within the bounds of `output_shape`, and - /// - `output_subset` has the same number of elements as the decoded representation shape. + /// - `output_subset` is within the bounds of `output_shape`, + /// - `output_subset` has the same number of elements as the decoded representation shape, and + /// - `output_subset`s must be non-overlapping when called in parallel on the same `output`. unsafe fn decode_into( &self, bytes: RawBytes<'_>, From 997178778bf2a0df8ca53cdc4dca8f2e871879c8 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Sun, 19 Jan 2025 07:37:03 +1100 Subject: [PATCH 17/45] Prepare 0.19.1 release --- CHANGELOG.md | 5 ++++- CITATION.cff | 4 ++-- zarrs/Cargo.toml | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 711cd97a..2db1ecfd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.19.1] - 2025-01-19 + ### Added - Document that elements in `ArrayBytes` must be in C-contiguous order @@ -1228,7 +1230,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Initial public release -[unreleased]: https://github.com/LDeakin/zarrs/compare/zarrs-v0.19.0...HEAD +[unreleased]: https://github.com/LDeakin/zarrs/compare/zarrs-v0.19.1...HEAD +[0.19.1]: https://github.com/LDeakin/zarrs/releases/tag/zarrs-v0.19.1 [0.19.0]: https://github.com/LDeakin/zarrs/releases/tag/zarrs-v0.19.0 [0.18.3]: https://github.com/LDeakin/zarrs/releases/tag/zarrs-v0.18.3 [0.18.2]: 
https://github.com/LDeakin/zarrs/releases/tag/zarrs-v0.18.2 diff --git a/CITATION.cff b/CITATION.cff index 44437c92..36903046 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,8 +1,8 @@ cff-version: 1.2.0 message: "If you use this software, please cite it as below." title: "zarrs" -version: 0.19.0 -date-released: 2025-01-10 +version: 0.19.1 +date-released: 2025-01-19 repository-code: "https://github.com/LDeakin/zarrs" url: "https://zarrs.dev" abstract: "zarrs is a Rust library for the Zarr storage format for multidimensional arrays and metadata." diff --git a/zarrs/Cargo.toml b/zarrs/Cargo.toml index d3c3168a..18f8596b 100644 --- a/zarrs/Cargo.toml +++ b/zarrs/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "zarrs" -version = "0.19.0" +version = "0.19.1" authors = ["Lachlan Deakin "] edition = "2021" rust-version = "1.82" From 3b705377e57316f187f00ba7c2791e9366864443 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Sun, 19 Jan 2025 16:02:29 +1100 Subject: [PATCH 18/45] Increment version to 0.20.0-dev --- zarrs/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarrs/Cargo.toml b/zarrs/Cargo.toml index 18f8596b..e4cee82d 100644 --- a/zarrs/Cargo.toml +++ b/zarrs/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "zarrs" -version = "0.19.1" +version = "0.20.0-dev" authors = ["Lachlan Deakin "] edition = "2021" rust-version = "1.82" From 862604b2a316c1132d22935d0c004870fbf82ff5 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Sun, 19 Jan 2025 16:04:22 +1100 Subject: [PATCH 19/45] refactor!: add `CodecMetadataOptions` (#131) --- zarrs/src/array.rs | 2 +- zarrs/src/array/array_metadata_options.rs | 28 +++++-------- zarrs/src/array/codec.rs | 10 +++-- .../array_to_array/bitround/bitround_codec.rs | 6 +-- .../transpose/transpose_codec.rs | 6 +-- .../codec/array_to_bytes/bytes/bytes_codec.rs | 7 ++-- .../array/codec/array_to_bytes/codec_chain.rs | 12 +++--- .../array_to_bytes/pcodec/pcodec_codec.rs | 10 ++--- 
.../array_to_bytes/sharding/sharding_codec.rs | 9 +++-- .../codec/array_to_bytes/vlen/vlen_codec.rs | 10 ++--- .../array_to_bytes/vlen_v2/vlen_v2_codec.rs | 7 ++-- .../array_to_bytes/vlen_v2/vlen_v2_macros.rs | 9 +++-- .../codec/array_to_bytes/zfp/zfp_codec.rs | 8 ++-- .../codec/bytes_to_bytes/blosc/blosc_codec.rs | 7 ++-- .../codec/bytes_to_bytes/bz2/bz2_codec.rs | 7 ++-- .../bytes_to_bytes/crc32c/crc32c_codec.rs | 7 ++-- .../fletcher32/fletcher32_codec.rs | 7 ++-- .../bytes_to_bytes/gdeflate/gdeflate_codec.rs | 6 +-- .../codec/bytes_to_bytes/gzip/gzip_codec.rs | 7 ++-- .../test_unbounded/test_unbounded_codec.rs | 7 ++-- .../codec/bytes_to_bytes/zstd/zstd_codec.rs | 7 ++-- zarrs/src/array/codec/metadata_options.rs | 39 +++++++++++++++++++ 22 files changed, 129 insertions(+), 89 deletions(-) create mode 100644 zarrs/src/array/codec/metadata_options.rs diff --git a/zarrs/src/array.rs b/zarrs/src/array.rs index c85aff98..e810b4dd 100644 --- a/zarrs/src/array.rs +++ b/zarrs/src/array.rs @@ -603,7 +603,7 @@ impl Array { // Codec metadata manipulation match &mut metadata { ArrayMetadata::V3(metadata) => { - metadata.codecs = self.codecs().create_metadatas_opt(options); + metadata.codecs = self.codecs().create_metadatas_opt(options.codec_options()); } ArrayMetadata::V2(_metadata) => { // NOTE: The codec related options in ArrayMetadataOptions do not impact V2 codecs diff --git a/zarrs/src/array/array_metadata_options.rs b/zarrs/src/array/array_metadata_options.rs index 2ec189a0..e0dafc36 100644 --- a/zarrs/src/array/array_metadata_options.rs +++ b/zarrs/src/array/array_metadata_options.rs @@ -1,9 +1,11 @@ use crate::config::{global_config, MetadataConvertVersion}; +use super::codec::CodecMetadataOptions; + /// Options for writing array metadata. 
#[derive(Debug, Clone)] pub struct ArrayMetadataOptions { - experimental_codec_store_metadata_if_encode_only: bool, + codec_options: CodecMetadataOptions, convert_version: MetadataConvertVersion, include_zarrs_metadata: bool, } @@ -11,7 +13,7 @@ pub struct ArrayMetadataOptions { impl Default for ArrayMetadataOptions { fn default() -> Self { Self { - experimental_codec_store_metadata_if_encode_only: false, + codec_options: CodecMetadataOptions::default(), convert_version: global_config().metadata_convert_version(), include_zarrs_metadata: global_config().include_zarrs_metadata(), } @@ -19,26 +21,16 @@ impl Default for ArrayMetadataOptions { } impl ArrayMetadataOptions { - /// Return the [experimental codec store metadata if encode only](crate::config::Config#experimental-codec-store-metadata-if-encode-only) setting. + /// Return the codec options. #[must_use] - pub fn experimental_codec_store_metadata_if_encode_only(&self) -> bool { - self.experimental_codec_store_metadata_if_encode_only + pub fn codec_options(&self) -> &CodecMetadataOptions { + &self.codec_options } - /// Set the [experimental codec store metadata if encode only](crate::config::Config#experimental-codec-store-metadata-if-encode-only) setting. + /// Return a mutable reference to the codec options. #[must_use] - pub fn with_experimental_codec_store_metadata_if_encode_only(mut self, enabled: bool) -> Self { - self.experimental_codec_store_metadata_if_encode_only = enabled; - self - } - - /// Set the [experimental codec store metadata if encode only](crate::config::Config#experimental-codec-store-metadata-if-encode-only) setting. 
- pub fn set_experimental_codec_store_metadata_if_encode_only( - &mut self, - enabled: bool, - ) -> &mut Self { - self.experimental_codec_store_metadata_if_encode_only = enabled; - self + pub fn codec_options_mut(&mut self) -> &mut CodecMetadataOptions { + &mut self.codec_options } /// Get the [metadata convert version](crate::config::Config#metadata-convert-version) configuration. diff --git a/zarrs/src/array/codec.rs b/zarrs/src/array/codec.rs index 1850b43c..f79facb0 100644 --- a/zarrs/src/array/codec.rs +++ b/zarrs/src/array/codec.rs @@ -13,8 +13,10 @@ pub mod array_to_array; pub mod array_to_bytes; pub mod bytes_to_bytes; +pub mod metadata_options; pub mod options; +pub use metadata_options::CodecMetadataOptions; pub use options::{CodecOptions, CodecOptionsBuilder}; // Array to array @@ -96,8 +98,8 @@ use std::sync::Arc; use super::array_bytes::update_bytes_flen; use super::{ - concurrency::RecommendedConcurrency, ArrayMetadataOptions, BytesRepresentation, - ChunkRepresentation, ChunkShape, DataType, + concurrency::RecommendedConcurrency, BytesRepresentation, ChunkRepresentation, ChunkShape, + DataType, }; use super::{ArrayBytes, RawBytes}; @@ -199,13 +201,13 @@ pub trait CodecTraits: Send + Sync { /// Create metadata. /// /// A hidden codec (e.g. a cache) will return [`None`], since it will not have any associated metadata. - fn create_metadata_opt(&self, options: &ArrayMetadataOptions) -> Option; + fn create_metadata_opt(&self, options: &CodecMetadataOptions) -> Option; /// Create metadata with default options. /// /// A hidden codec (e.g. a cache) will return [`None`], since it will not have any associated metadata. fn create_metadata(&self) -> Option { - self.create_metadata_opt(&ArrayMetadataOptions::default()) + self.create_metadata_opt(&CodecMetadataOptions::default()) } /// Indicates if the input to a codecs partial decoder should be cached for optimal performance. 
diff --git a/zarrs/src/array/codec/array_to_array/bitround/bitround_codec.rs b/zarrs/src/array/codec/array_to_array/bitround/bitround_codec.rs index 51c8249f..fc288375 100644 --- a/zarrs/src/array/codec/array_to_array/bitround/bitround_codec.rs +++ b/zarrs/src/array/codec/array_to_array/bitround/bitround_codec.rs @@ -5,9 +5,9 @@ use crate::{ codec::{ options::CodecOptions, ArrayBytes, ArrayCodecTraits, ArrayPartialDecoderTraits, ArrayPartialEncoderTraits, ArrayToArrayCodecTraits, ArrayToArrayPartialEncoderDefault, - CodecError, CodecTraits, RecommendedConcurrency, + CodecError, CodecMetadataOptions, CodecTraits, RecommendedConcurrency, }, - ArrayMetadataOptions, ChunkRepresentation, ChunkShape, DataType, + ChunkRepresentation, ChunkShape, DataType, }, config::global_config, metadata::v3::MetadataV3, @@ -47,7 +47,7 @@ impl BitroundCodec { } impl CodecTraits for BitroundCodec { - fn create_metadata_opt(&self, options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, options: &CodecMetadataOptions) -> Option { if options.experimental_codec_store_metadata_if_encode_only() { let configuration = BitroundCodecConfigurationV1 { keepbits: self.keepbits, diff --git a/zarrs/src/array/codec/array_to_array/transpose/transpose_codec.rs b/zarrs/src/array/codec/array_to_array/transpose/transpose_codec.rs index 1ac8a968..85032f30 100644 --- a/zarrs/src/array/codec/array_to_array/transpose/transpose_codec.rs +++ b/zarrs/src/array/codec/array_to_array/transpose/transpose_codec.rs @@ -5,9 +5,9 @@ use crate::{ codec::{ options::CodecOptions, ArrayBytes, ArrayCodecTraits, ArrayPartialDecoderTraits, ArrayPartialEncoderTraits, ArrayToArrayCodecTraits, ArrayToArrayPartialEncoderDefault, - CodecError, CodecTraits, RecommendedConcurrency, + CodecError, CodecMetadataOptions, CodecTraits, RecommendedConcurrency, }, - ArrayMetadataOptions, ChunkRepresentation, ChunkShape, + ChunkRepresentation, ChunkShape, }, 
metadata::v3::{array::codec::transpose::TransposeCodecConfigurationV1, MetadataV3}, plugin::PluginCreateError, @@ -48,7 +48,7 @@ impl TransposeCodec { } impl CodecTraits for TransposeCodec { - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { let configuration = TransposeCodecConfigurationV1 { order: self.order.clone(), }; diff --git a/zarrs/src/array/codec/array_to_bytes/bytes/bytes_codec.rs b/zarrs/src/array/codec/array_to_bytes/bytes/bytes_codec.rs index f6cff4a3..ce379a00 100644 --- a/zarrs/src/array/codec/array_to_bytes/bytes/bytes_codec.rs +++ b/zarrs/src/array/codec/array_to_bytes/bytes/bytes_codec.rs @@ -7,11 +7,10 @@ use crate::{ codec::{ ArrayCodecTraits, ArrayPartialDecoderTraits, ArrayPartialEncoderDefault, ArrayPartialEncoderTraits, ArrayToBytesCodecTraits, BytesPartialDecoderTraits, - BytesPartialEncoderTraits, CodecError, CodecOptions, CodecTraits, + BytesPartialEncoderTraits, CodecError, CodecMetadataOptions, CodecOptions, CodecTraits, RecommendedConcurrency, }, - ArrayBytes, ArrayMetadataOptions, BytesRepresentation, ChunkRepresentation, DataTypeSize, - RawBytes, + ArrayBytes, BytesRepresentation, ChunkRepresentation, DataTypeSize, RawBytes, }, metadata::v3::MetadataV3, }; @@ -101,7 +100,7 @@ impl BytesCodec { } impl CodecTraits for BytesCodec { - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { let configuration = BytesCodecConfigurationV1 { endian: self.endian, }; diff --git a/zarrs/src/array/codec/array_to_bytes/codec_chain.rs b/zarrs/src/array/codec/array_to_bytes/codec_chain.rs index be8fae6a..804b683f 100644 --- a/zarrs/src/array/codec/array_to_bytes/codec_chain.rs +++ b/zarrs/src/array/codec/array_to_bytes/codec_chain.rs @@ -11,11 +11,11 @@ use crate::{ ArrayCodecTraits, ArrayPartialDecoderCache, ArrayPartialDecoderTraits, 
ArrayPartialEncoderTraits, ArrayToArrayCodecTraits, ArrayToBytesCodecTraits, BytesPartialDecoderCache, BytesPartialDecoderTraits, BytesPartialEncoderTraits, - BytesToBytesCodecTraits, Codec, CodecError, CodecOptions, CodecTraits, + BytesToBytesCodecTraits, Codec, CodecError, CodecMetadataOptions, CodecOptions, + CodecTraits, }, concurrency::RecommendedConcurrency, - ArrayBytes, ArrayMetadataOptions, BytesRepresentation, ChunkRepresentation, ChunkShape, - RawBytes, + ArrayBytes, BytesRepresentation, ChunkRepresentation, ChunkShape, RawBytes, }, array_subset::ArraySubset, metadata::v3::MetadataV3, @@ -137,7 +137,7 @@ impl CodecChain { /// Create codec chain metadata. #[must_use] - pub fn create_metadatas_opt(&self, options: &ArrayMetadataOptions) -> Vec { + pub fn create_metadatas_opt(&self, options: &CodecMetadataOptions) -> Vec { let mut metadatas = Vec::with_capacity(self.array_to_array.len() + 1 + self.bytes_to_bytes.len()); for codec in &self.array_to_array { @@ -159,7 +159,7 @@ impl CodecChain { /// Create codec chain metadata with default options. #[must_use] pub fn create_metadatas(&self) -> Vec { - self.create_metadatas_opt(&ArrayMetadataOptions::default()) + self.create_metadatas_opt(&CodecMetadataOptions::default()) } /// Get the array to array codecs @@ -215,7 +215,7 @@ impl CodecTraits for CodecChain { /// Returns [`None`] since a codec chain does not have standard codec metadata. /// /// Note that usage of the codec chain is explicit in [`Array`](crate::array::Array) and [`CodecChain::create_metadatas_opt()`] will call [`CodecTraits::create_metadata_opt()`] from for each codec. 
- fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { None } diff --git a/zarrs/src/array/codec/array_to_bytes/pcodec/pcodec_codec.rs b/zarrs/src/array/codec/array_to_bytes/pcodec/pcodec_codec.rs index 10c9f6db..94b47549 100644 --- a/zarrs/src/array/codec/array_to_bytes/pcodec/pcodec_codec.rs +++ b/zarrs/src/array/codec/array_to_bytes/pcodec/pcodec_codec.rs @@ -10,11 +10,11 @@ use crate::{ codec::{ ArrayBytes, ArrayCodecTraits, ArrayPartialDecoderTraits, ArrayPartialEncoderDefault, ArrayPartialEncoderTraits, ArrayToBytesCodecTraits, BytesPartialDecoderTraits, - BytesPartialEncoderTraits, CodecError, CodecOptions, CodecTraits, RawBytes, - RecommendedConcurrency, + BytesPartialEncoderTraits, CodecError, CodecMetadataOptions, CodecOptions, CodecTraits, + RawBytes, RecommendedConcurrency, }, - convert_from_bytes_slice, transmute_to_bytes_vec, ArrayMetadataOptions, - BytesRepresentation, ChunkRepresentation, DataType, + convert_from_bytes_slice, transmute_to_bytes_vec, BytesRepresentation, ChunkRepresentation, + DataType, }, config::global_config, metadata::v3::{array::codec::pcodec::PcodecModeSpecConfiguration, MetadataV3}, @@ -84,7 +84,7 @@ impl PcodecCodec { } impl CodecTraits for PcodecCodec { - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { let mode_spec = mode_spec_pco_to_config(&self.chunk_config.mode_spec); let (delta_spec, delta_encoding_order) = match self.chunk_config.delta_spec { DeltaSpec::Auto => (PcodecDeltaSpecConfiguration::Auto, None), diff --git a/zarrs/src/array/codec/array_to_bytes/sharding/sharding_codec.rs b/zarrs/src/array/codec/array_to_bytes/sharding/sharding_codec.rs index f6ec782f..f432cc02 100644 --- a/zarrs/src/array/codec/array_to_bytes/sharding/sharding_codec.rs +++ b/zarrs/src/array/codec/array_to_bytes/sharding/sharding_codec.rs @@ 
-11,11 +11,12 @@ use crate::{ codec::{ ArrayCodecTraits, ArrayPartialDecoderTraits, ArrayPartialEncoderTraits, ArrayToBytesCodecTraits, BytesPartialDecoderTraits, BytesPartialEncoderTraits, - CodecChain, CodecError, CodecOptions, CodecTraits, RecommendedConcurrency, + CodecChain, CodecError, CodecMetadataOptions, CodecOptions, CodecTraits, + RecommendedConcurrency, }, concurrency::calc_concurrency_outer_inner, - transmute_to_bytes_vec, unravel_index, ArrayBytes, ArrayMetadataOptions, ArraySize, - BytesRepresentation, ChunkRepresentation, ChunkShape, DataTypeSize, FillValue, RawBytes, + transmute_to_bytes_vec, unravel_index, ArrayBytes, ArraySize, BytesRepresentation, + ChunkRepresentation, ChunkShape, DataTypeSize, FillValue, RawBytes, }, array_subset::ArraySubset, metadata::v3::MetadataV3, @@ -85,7 +86,7 @@ impl ShardingCodec { } impl CodecTraits for ShardingCodec { - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { let configuration = ShardingCodecConfigurationV1 { chunk_shape: self.chunk_shape.clone(), codecs: self.inner_codecs.create_metadatas(), diff --git a/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs b/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs index 18d389f7..cc3c44e2 100644 --- a/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs +++ b/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs @@ -5,11 +5,11 @@ use crate::{ codec::{ ArrayCodecTraits, ArrayPartialDecoderTraits, ArrayPartialEncoderDefault, ArrayPartialEncoderTraits, ArrayToBytesCodecTraits, BytesCodec, - BytesPartialDecoderTraits, BytesPartialEncoderTraits, CodecError, CodecOptions, - CodecTraits, RecommendedConcurrency, + BytesPartialDecoderTraits, BytesPartialEncoderTraits, CodecError, CodecMetadataOptions, + CodecOptions, CodecTraits, RecommendedConcurrency, }, - transmute_to_bytes_vec, ArrayBytes, ArrayMetadataOptions, BytesRepresentation, - 
ChunkRepresentation, CodecChain, DataType, DataTypeSize, Endianness, FillValue, RawBytes, + transmute_to_bytes_vec, ArrayBytes, BytesRepresentation, ChunkRepresentation, CodecChain, + DataType, DataTypeSize, Endianness, FillValue, RawBytes, }, config::global_config, metadata::v3::{array::codec::vlen::VlenIndexDataType, MetadataV3}, @@ -83,7 +83,7 @@ impl VlenCodec { } impl CodecTraits for VlenCodec { - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { let configuration = VlenCodecConfigurationV1 { index_codecs: self.index_codecs.create_metadatas(), data_codecs: self.data_codecs.create_metadatas(), diff --git a/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_codec.rs b/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_codec.rs index dec590c9..def1a44d 100644 --- a/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_codec.rs +++ b/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_codec.rs @@ -7,11 +7,10 @@ use crate::{ codec::{ ArrayCodecTraits, ArrayPartialDecoderTraits, ArrayPartialEncoderDefault, ArrayPartialEncoderTraits, ArrayToBytesCodecTraits, BytesPartialDecoderTraits, - BytesPartialEncoderTraits, CodecError, CodecOptions, CodecTraits, + BytesPartialEncoderTraits, CodecError, CodecMetadataOptions, CodecOptions, CodecTraits, RecommendedConcurrency, }, - ArrayBytes, ArrayMetadataOptions, BytesRepresentation, ChunkRepresentation, DataTypeSize, - RawBytes, + ArrayBytes, BytesRepresentation, ChunkRepresentation, DataTypeSize, RawBytes, }, config::global_config, metadata::v3::MetadataV3, @@ -35,7 +34,7 @@ impl VlenV2Codec { } impl CodecTraits for VlenV2Codec { - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { let config = global_config(); let name = config .experimental_codec_names() diff --git 
a/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_macros.rs b/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_macros.rs index 0ea12587..40a887be 100644 --- a/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_macros.rs +++ b/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_macros.rs @@ -45,10 +45,11 @@ macro_rules! vlen_v2_codec { codec::{ array_to_bytes::vlen_v2::VlenV2Codec, ArrayPartialDecoderTraits, ArrayPartialEncoderTraits, ArrayToBytesCodecTraits, BytesPartialDecoderTraits, - BytesPartialEncoderTraits, CodecError, CodecOptions, CodecTraits, + BytesPartialEncoderTraits, CodecError, CodecMetadataOptions, CodecOptions, + CodecTraits, }, - ArrayBytes, ArrayCodecTraits, ArrayMetadataOptions, BytesRepresentation, - ChunkRepresentation, RawBytes, RecommendedConcurrency, + ArrayBytes, ArrayCodecTraits, BytesRepresentation, ChunkRepresentation, RawBytes, + RecommendedConcurrency, }; #[cfg(feature = "async")] @@ -77,7 +78,7 @@ macro_rules! vlen_v2_codec { } impl CodecTraits for $struct { - fn create_metadata_opt(&self, options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, options: &CodecMetadataOptions) -> Option { self.inner.create_metadata_opt(options) } diff --git a/zarrs/src/array/codec/array_to_bytes/zfp/zfp_codec.rs b/zarrs/src/array/codec/array_to_bytes/zfp/zfp_codec.rs index 8a7a1406..81294871 100644 --- a/zarrs/src/array/codec/array_to_bytes/zfp/zfp_codec.rs +++ b/zarrs/src/array/codec/array_to_bytes/zfp/zfp_codec.rs @@ -15,10 +15,10 @@ use crate::{ codec::{ ArrayBytes, ArrayCodecTraits, ArrayPartialDecoderTraits, ArrayPartialEncoderDefault, ArrayPartialEncoderTraits, ArrayToBytesCodecTraits, BytesPartialDecoderTraits, - BytesPartialEncoderTraits, CodecError, CodecOptions, CodecTraits, RawBytes, - RecommendedConcurrency, + BytesPartialEncoderTraits, CodecError, CodecMetadataOptions, CodecOptions, CodecTraits, + RawBytes, RecommendedConcurrency, }, - ArrayMetadataOptions, BytesRepresentation, ChunkRepresentation, DataType, 
+ BytesRepresentation, ChunkRepresentation, DataType, }, config::global_config, metadata::v3::{array::codec::zfp::ZfpMode, MetadataV3}, @@ -129,7 +129,7 @@ impl ZfpCodec { } impl CodecTraits for ZfpCodec { - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { let configuration = ZfpCodecConfigurationV1 { write_header: Some(self.write_header), mode: self.mode, diff --git a/zarrs/src/array/codec/bytes_to_bytes/blosc/blosc_codec.rs b/zarrs/src/array/codec/bytes_to_bytes/blosc/blosc_codec.rs index 62972cd9..afd1bdb0 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/blosc/blosc_codec.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/blosc/blosc_codec.rs @@ -6,9 +6,10 @@ use crate::{ array::{ codec::{ BytesPartialDecoderTraits, BytesPartialEncoderDefault, BytesPartialEncoderTraits, - BytesToBytesCodecTraits, CodecError, CodecOptions, CodecTraits, RecommendedConcurrency, + BytesToBytesCodecTraits, CodecError, CodecMetadataOptions, CodecOptions, CodecTraits, + RecommendedConcurrency, }, - ArrayMetadataOptions, BytesRepresentation, RawBytes, + BytesRepresentation, RawBytes, }, metadata::v3::MetadataV3, plugin::PluginCreateError, @@ -131,7 +132,7 @@ impl BloscCodec { } impl CodecTraits for BloscCodec { - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { let configuration = BloscCodecConfigurationV1 { cname: self.cname, clevel: self.clevel, diff --git a/zarrs/src/array/codec/bytes_to_bytes/bz2/bz2_codec.rs b/zarrs/src/array/codec/bytes_to_bytes/bz2/bz2_codec.rs index d1380c3e..e967e681 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/bz2/bz2_codec.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/bz2/bz2_codec.rs @@ -8,9 +8,10 @@ use crate::{ array::{ codec::{ BytesPartialDecoderTraits, BytesPartialEncoderDefault, BytesPartialEncoderTraits, - BytesToBytesCodecTraits, CodecError, 
CodecOptions, CodecTraits, RecommendedConcurrency, + BytesToBytesCodecTraits, CodecError, CodecMetadataOptions, CodecOptions, CodecTraits, + RecommendedConcurrency, }, - ArrayMetadataOptions, BytesRepresentation, RawBytes, + BytesRepresentation, RawBytes, }, config::global_config, metadata::v3::MetadataV3, @@ -46,7 +47,7 @@ impl Bz2Codec { } impl CodecTraits for Bz2Codec { - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { let configuration = Bz2CodecConfigurationV1 { level: Bz2CompressionLevel::try_from(self.compression.level()) .expect("checked on init"), diff --git a/zarrs/src/array/codec/bytes_to_bytes/crc32c/crc32c_codec.rs b/zarrs/src/array/codec/bytes_to_bytes/crc32c/crc32c_codec.rs index 7e88534b..47d1cfc6 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/crc32c/crc32c_codec.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/crc32c/crc32c_codec.rs @@ -5,9 +5,10 @@ use crate::{ codec::{ bytes_to_bytes::strip_suffix_partial_decoder::StripSuffixPartialDecoder, BytesPartialDecoderTraits, BytesPartialEncoderDefault, BytesPartialEncoderTraits, - BytesToBytesCodecTraits, CodecError, CodecOptions, CodecTraits, RecommendedConcurrency, + BytesToBytesCodecTraits, CodecError, CodecMetadataOptions, CodecOptions, CodecTraits, + RecommendedConcurrency, }, - ArrayMetadataOptions, BytesRepresentation, RawBytes, + BytesRepresentation, RawBytes, }, metadata::v3::MetadataV3, }; @@ -39,7 +40,7 @@ impl Crc32cCodec { } impl CodecTraits for Crc32cCodec { - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { let configuration = Crc32cCodecConfigurationV1 {}; Some(MetadataV3::new_with_serializable_configuration(IDENTIFIER, &configuration).unwrap()) } diff --git a/zarrs/src/array/codec/bytes_to_bytes/fletcher32/fletcher32_codec.rs 
b/zarrs/src/array/codec/bytes_to_bytes/fletcher32/fletcher32_codec.rs index 04e319f6..0944ea56 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/fletcher32/fletcher32_codec.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/fletcher32/fletcher32_codec.rs @@ -7,9 +7,10 @@ use crate::{ codec::{ bytes_to_bytes::strip_suffix_partial_decoder::StripSuffixPartialDecoder, BytesPartialDecoderTraits, BytesPartialEncoderDefault, BytesPartialEncoderTraits, - BytesToBytesCodecTraits, CodecError, CodecOptions, CodecTraits, RecommendedConcurrency, + BytesToBytesCodecTraits, CodecError, CodecMetadataOptions, CodecOptions, CodecTraits, + RecommendedConcurrency, }, - ArrayMetadataOptions, BytesRepresentation, RawBytes, + BytesRepresentation, RawBytes, }, metadata::v3::MetadataV3, }; @@ -43,7 +44,7 @@ impl Fletcher32Codec { } impl CodecTraits for Fletcher32Codec { - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { let configuration = Fletcher32CodecConfigurationV1 {}; Some(MetadataV3::new_with_serializable_configuration(IDENTIFIER, &configuration).unwrap()) } diff --git a/zarrs/src/array/codec/bytes_to_bytes/gdeflate/gdeflate_codec.rs b/zarrs/src/array/codec/bytes_to_bytes/gdeflate/gdeflate_codec.rs index 93ac2590..8d969abd 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/gdeflate/gdeflate_codec.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/gdeflate/gdeflate_codec.rs @@ -4,9 +4,9 @@ use crate::{ array::{ codec::{ BytesPartialDecoderTraits, BytesPartialEncoderDefault, BytesPartialEncoderTraits, - BytesToBytesCodecTraits, CodecError, CodecOptions, CodecTraits, + BytesToBytesCodecTraits, CodecError, CodecMetadataOptions, CodecOptions, CodecTraits, }, - ArrayMetadataOptions, BytesRepresentation, RawBytes, RecommendedConcurrency, + BytesRepresentation, RawBytes, RecommendedConcurrency, }, metadata::v3::MetadataV3, }; @@ -47,7 +47,7 @@ impl GDeflateCodec { } impl CodecTraits for GDeflateCodec 
{ - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { let configuration = GDeflateCodecConfigurationV1 { level: self.compression_level, }; diff --git a/zarrs/src/array/codec/bytes_to_bytes/gzip/gzip_codec.rs b/zarrs/src/array/codec/bytes_to_bytes/gzip/gzip_codec.rs index af57ada4..aa249c68 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/gzip/gzip_codec.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/gzip/gzip_codec.rs @@ -10,9 +10,10 @@ use crate::{ array::{ codec::{ BytesPartialDecoderTraits, BytesPartialEncoderDefault, BytesPartialEncoderTraits, - BytesToBytesCodecTraits, CodecError, CodecOptions, CodecTraits, RecommendedConcurrency, + BytesToBytesCodecTraits, CodecError, CodecMetadataOptions, CodecOptions, CodecTraits, + RecommendedConcurrency, }, - ArrayMetadataOptions, BytesRepresentation, RawBytes, + BytesRepresentation, RawBytes, }, metadata::v3::MetadataV3, }; @@ -52,7 +53,7 @@ impl GzipCodec { } impl CodecTraits for GzipCodec { - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { let configuration = GzipCodecConfigurationV1 { level: self.compression_level, }; diff --git a/zarrs/src/array/codec/bytes_to_bytes/test_unbounded/test_unbounded_codec.rs b/zarrs/src/array/codec/bytes_to_bytes/test_unbounded/test_unbounded_codec.rs index ee419d23..4166a6ca 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/test_unbounded/test_unbounded_codec.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/test_unbounded/test_unbounded_codec.rs @@ -4,9 +4,10 @@ use crate::{ array::{ codec::{ BytesPartialDecoderTraits, BytesPartialEncoderDefault, BytesPartialEncoderTraits, - BytesToBytesCodecTraits, CodecError, CodecOptions, CodecTraits, RecommendedConcurrency, + BytesToBytesCodecTraits, CodecError, CodecMetadataOptions, CodecOptions, CodecTraits, + RecommendedConcurrency, }, - 
ArrayMetadataOptions, BytesRepresentation, RawBytes, + BytesRepresentation, RawBytes, }, metadata::v3::MetadataV3, }; @@ -31,7 +32,7 @@ impl TestUnboundedCodec { } impl CodecTraits for TestUnboundedCodec { - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { None } diff --git a/zarrs/src/array/codec/bytes_to_bytes/zstd/zstd_codec.rs b/zarrs/src/array/codec/bytes_to_bytes/zstd/zstd_codec.rs index 185b060f..03138c63 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/zstd/zstd_codec.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/zstd/zstd_codec.rs @@ -6,9 +6,10 @@ use crate::{ array::{ codec::{ BytesPartialDecoderTraits, BytesPartialEncoderDefault, BytesPartialEncoderTraits, - BytesToBytesCodecTraits, CodecError, CodecOptions, CodecTraits, RecommendedConcurrency, + BytesToBytesCodecTraits, CodecError, CodecMetadataOptions, CodecOptions, CodecTraits, + RecommendedConcurrency, }, - ArrayMetadataOptions, BytesRepresentation, RawBytes, + BytesRepresentation, RawBytes, }, metadata::v3::MetadataV3, }; @@ -47,7 +48,7 @@ impl ZstdCodec { } impl CodecTraits for ZstdCodec { - fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { + fn create_metadata_opt(&self, _options: &CodecMetadataOptions) -> Option { let configuration = ZstdCodecConfigurationV1 { level: self.compression.into(), checksum: self.checksum, diff --git a/zarrs/src/array/codec/metadata_options.rs b/zarrs/src/array/codec/metadata_options.rs new file mode 100644 index 00000000..7a0a94c2 --- /dev/null +++ b/zarrs/src/array/codec/metadata_options.rs @@ -0,0 +1,39 @@ +//! Codec metadata options. + +/// Options for codec metadata. 
+#[derive(Debug, Clone, Default)] +pub struct CodecMetadataOptions { + experimental_codec_store_metadata_if_encode_only: bool, +} + +// impl Default for CodecMetadataOptions { +// fn default() -> Self { +// Self { +// experimental_codec_store_metadata_if_encode_only: false, +// } +// } +// } + +impl CodecMetadataOptions { + /// Return the [experimental codec store metadata if encode only](crate::config::Config#experimental-codec-store-metadata-if-encode-only) setting. + #[must_use] + pub fn experimental_codec_store_metadata_if_encode_only(&self) -> bool { + self.experimental_codec_store_metadata_if_encode_only + } + + /// Set the [experimental codec store metadata if encode only](crate::config::Config#experimental-codec-store-metadata-if-encode-only) setting. + #[must_use] + pub fn with_experimental_codec_store_metadata_if_encode_only(mut self, enabled: bool) -> Self { + self.experimental_codec_store_metadata_if_encode_only = enabled; + self + } + + /// Set the [experimental codec store metadata if encode only](crate::config::Config#experimental-codec-store-metadata-if-encode-only) setting. 
+ pub fn set_experimental_codec_store_metadata_if_encode_only( + &mut self, + enabled: bool, + ) -> &mut Self { + self.experimental_codec_store_metadata_if_encode_only = enabled; + self + } +} From beecd3003915ffa7ea0454a24448ca31c28650bf Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Sun, 19 Jan 2025 16:10:42 +1100 Subject: [PATCH 20/45] refactor!: change `ArraySubset::inbounds` to take another subset rather than a shape (#134) --- CHANGELOG.md | 6 ++++ zarrs/src/array/array_async_readable.rs | 4 +-- zarrs/src/array/array_sync_readable.rs | 4 +-- .../chunk_cache/array_chunk_cache_ext_sync.rs | 2 +- zarrs/src/array/codec.rs | 6 ++-- zarrs/src/array_subset.rs | 32 ++++++++++++++++--- 6 files changed, 41 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2db1ecfd..39a84d5e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added +- Add `ArraySubset::inbounds_shape()` (matches the old `ArraySubset::inbounds` behaviour) + +### Changed +- **Breaking**: change `ArraySubset::inbounds` to take another subset rather than a shape + ## [0.19.1] - 2025-01-19 ### Added diff --git a/zarrs/src/array/array_async_readable.rs b/zarrs/src/array/array_async_readable.rs index 79ca0f72..8139b356 100644 --- a/zarrs/src/array/array_async_readable.rs +++ b/zarrs/src/array/array_async_readable.rs @@ -737,7 +737,7 @@ impl Array { options: &CodecOptions, ) -> Result, ArrayError> { let chunk_representation = self.chunk_array_representation(chunk_indices)?; - if !chunk_subset.inbounds(&chunk_representation.shape_u64()) { + if !chunk_subset.inbounds_shape(&chunk_representation.shape_u64()) { return Err(ArrayError::InvalidArraySubset( chunk_subset.clone(), self.shape().to_vec(), @@ -783,7 +783,7 @@ impl Array { options: &CodecOptions, ) -> Result<(), ArrayError> { let chunk_representation = self.chunk_array_representation(chunk_indices)?; - if 
!chunk_subset.inbounds(&chunk_representation.shape_u64()) { + if !chunk_subset.inbounds_shape(&chunk_representation.shape_u64()) { return Err(ArrayError::InvalidArraySubset( chunk_subset.clone(), self.shape().to_vec(), diff --git a/zarrs/src/array/array_sync_readable.rs b/zarrs/src/array/array_sync_readable.rs index 200e0ba6..a2b8d4b2 100644 --- a/zarrs/src/array/array_sync_readable.rs +++ b/zarrs/src/array/array_sync_readable.rs @@ -794,7 +794,7 @@ impl Array { options: &CodecOptions, ) -> Result, ArrayError> { let chunk_representation = self.chunk_array_representation(chunk_indices)?; - if !chunk_subset.inbounds(&chunk_representation.shape_u64()) { + if !chunk_subset.inbounds_shape(&chunk_representation.shape_u64()) { return Err(ArrayError::InvalidArraySubset( chunk_subset.clone(), self.shape().to_vec(), @@ -837,7 +837,7 @@ impl Array { options: &CodecOptions, ) -> Result<(), ArrayError> { let chunk_representation = self.chunk_array_representation(chunk_indices)?; - if !chunk_subset.inbounds(&chunk_representation.shape_u64()) { + if !chunk_subset.inbounds_shape(&chunk_representation.shape_u64()) { return Err(ArrayError::InvalidArraySubset( chunk_subset.clone(), self.shape().to_vec(), diff --git a/zarrs/src/array/chunk_cache/array_chunk_cache_ext_sync.rs b/zarrs/src/array/chunk_cache/array_chunk_cache_ext_sync.rs index db3f3fee..03cd7036 100644 --- a/zarrs/src/array/chunk_cache/array_chunk_cache_ext_sync.rs +++ b/zarrs/src/array/chunk_cache/array_chunk_cache_ext_sync.rs @@ -229,7 +229,7 @@ impl ArrayChunkCacheExt Result, ArrayError> { let chunk_representation = self.chunk_array_representation(chunk_indices)?; - if !chunk_subset.inbounds(&chunk_representation.shape_u64()) { + if !chunk_subset.inbounds_shape(&chunk_representation.shape_u64()) { return Err(ArrayError::InvalidArraySubset( chunk_subset.clone(), self.shape().to_vec(), diff --git a/zarrs/src/array/codec.rs b/zarrs/src/array/codec.rs index f79facb0..e911ec70 100644 --- a/zarrs/src/array/codec.rs +++ 
b/zarrs/src/array/codec.rs @@ -377,7 +377,7 @@ pub trait ArrayPartialDecoderTraits: Any + Send + Sync { output_subset: &ArraySubset, options: &CodecOptions, ) -> Result<(), CodecError> { - debug_assert!(output_subset.inbounds(output_shape)); + debug_assert!(output_subset.inbounds_shape(output_shape)); debug_assert_eq!(array_subset.num_elements(), output_subset.num_elements()); let decoded_value = self .partial_decode(&[array_subset.clone()], options)? @@ -462,7 +462,7 @@ pub trait AsyncArrayPartialDecoderTraits: Any + Send + Sync { output_subset: &ArraySubset, options: &CodecOptions, ) -> Result<(), CodecError> { - debug_assert!(output_subset.inbounds(output_shape)); + debug_assert!(output_subset.inbounds_shape(output_shape)); debug_assert_eq!(array_subset.shape(), output_subset.shape()); let decoded_value = self .partial_decode(&[array_subset.clone()], options) @@ -730,7 +730,7 @@ pub trait ArrayToBytesCodecTraits: ArrayCodecTraits + core::fmt::Debug { output_subset: &ArraySubset, options: &CodecOptions, ) -> Result<(), CodecError> { - debug_assert!(output_subset.inbounds(output_shape)); + debug_assert!(output_subset.inbounds_shape(output_shape)); debug_assert_eq!( decoded_representation.num_elements(), output_subset.num_elements() diff --git a/zarrs/src/array_subset.rs b/zarrs/src/array_subset.rs index ac7b8cde..ef8846e0 100644 --- a/zarrs/src/array_subset.rs +++ b/zarrs/src/array_subset.rs @@ -564,9 +564,26 @@ impl ArraySubset { } } - /// Returns true if the array subset is within the bounds of `array_shape`. + /// Returns true if this array subset is within the bounds of `subset`. 
#[must_use] - pub fn inbounds(&self, array_shape: &[u64]) -> bool { + pub fn inbounds(&self, subset: &ArraySubset) -> bool { + if self.dimensionality() != subset.dimensionality() { + return false; + } + + for (self_start, self_shape, other_start, other_shape) in + izip!(self.start(), self.shape(), subset.start(), subset.shape()) + { + if self_start < other_start || self_start + self_shape > other_start + other_shape { + return false; + } + } + true + } + + /// Returns true if the array subset is within the bounds of an `ArraySubset` with zero origin and a shape of `array_shape`. + #[must_use] + pub fn inbounds_shape(&self, array_shape: &[u64]) -> bool { if self.dimensionality() != array_shape.len() { return false; } @@ -646,9 +663,14 @@ mod tests { ArraySubset::new_with_ranges(&[0..4, 1..5]) ); assert!(array_subset0.relative_to(&[1, 1, 1]).is_err()); - assert!(array_subset0.inbounds(&[10, 10])); - assert!(!array_subset0.inbounds(&[2, 2])); - assert!(!array_subset0.inbounds(&[10, 10, 10])); + assert!(array_subset0.inbounds_shape(&[10, 10])); + assert!(!array_subset0.inbounds_shape(&[2, 2])); + assert!(!array_subset0.inbounds_shape(&[10, 10, 10])); + assert!(array_subset0.inbounds(&ArraySubset::new_with_ranges(&[0..6, 1..7]))); + assert!(array_subset0.inbounds(&ArraySubset::new_with_ranges(&[1..5, 2..6]))); + assert!(!array_subset0.inbounds(&ArraySubset::new_with_ranges(&[2..5, 2..6]))); + assert!(!array_subset0.inbounds(&ArraySubset::new_with_ranges(&[1..5, 2..5]))); + assert!(!array_subset0.inbounds(&ArraySubset::new_with_ranges(&[2..5]))); assert_eq!(array_subset0.to_ranges(), vec![1..5, 2..6]); let array_subset2 = ArraySubset::new_with_ranges(&[3..6, 4..7, 0..1]); From f675f06130b39edf2488d560316704b334d823b9 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Sun, 19 Jan 2025 16:22:48 +1100 Subject: [PATCH 21/45] refactor!: add new types of `CodecError`s (#135) --- CHANGELOG.md | 4 + zarrs/src/array.rs | 5 +- zarrs/src/array/array_bytes.rs | 21 ++--- 
zarrs/src/array/codec.rs | 85 ++++++++++++++++++- .../codec/array_to_bytes/bytes/bytes_codec.rs | 13 ++- zarrs/src/array/codec/array_to_bytes/vlen.rs | 7 +- .../src/array/codec/array_to_bytes/vlen_v2.rs | 10 +-- .../array/codec/bytes_to_bytes/gdeflate.rs | 16 ++-- 8 files changed, 121 insertions(+), 40 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 39a84d5e..4bed6036 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,10 +8,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- Add `array:codec::{InvalidBytesLengthError,InvalidArrayShapeError,InvalidNumberOfElementsError,SubsetOutOfBoundsError}` - Add `ArraySubset::inbounds_shape()` (matches the old `ArraySubset::inbounds` behaviour) ### Changed - **Breaking**: change `ArraySubset::inbounds` to take another subset rather than a shape +- **Breaking**: `CodecError` enum changes: + - Change `CodecError::UnexpectedChunkDecodedSize` to an `InvalidBytesLengthError` + - Add `CodecError::{InvalidArrayShape,InvalidNumberOfElements,SubsetOutOfBounds}` ## [0.19.1] - 2025-01-19 diff --git a/zarrs/src/array.rs b/zarrs/src/array.rs index e810b4dd..4d595709 100644 --- a/zarrs/src/array.rs +++ b/zarrs/src/array.rs @@ -910,10 +910,7 @@ pub fn elements_to_ndarray( ) -> Result, ArrayError> { let length = elements.len(); ndarray::ArrayD::::from_shape_vec(iter_u64_to_usize(shape.iter()), elements).map_err(|_| { - ArrayError::CodecError(codec::CodecError::UnexpectedChunkDecodedSize( - length * size_of::(), - shape.iter().product::() * size_of::() as u64, - )) + ArrayError::CodecError(codec::InvalidArrayShapeError::new(shape.to_vec(), length).into()) }) } diff --git a/zarrs/src/array/array_bytes.rs b/zarrs/src/array/array_bytes.rs index 53c4e7b5..eabe18de 100644 --- a/zarrs/src/array/array_bytes.rs +++ b/zarrs/src/array/array_bytes.rs @@ -10,7 +10,10 @@ use crate::{ metadata::v3::array::data_type::DataTypeSize, }; -use super::{codec::CodecError, ravel_indices, 
ArraySize, DataType, FillValue}; +use super::{ + codec::{CodecError, InvalidBytesLengthError}, + ravel_indices, ArraySize, DataType, FillValue, +}; /// Array element bytes. /// @@ -217,14 +220,11 @@ impl<'a> ArrayBytes<'a> { } /// Validate fixed length array bytes for a given array size. -fn validate_bytes_flen(bytes: &RawBytes, array_size: u64) -> Result<(), CodecError> { - if bytes.len() as u64 == array_size { +fn validate_bytes_flen(bytes: &RawBytes, array_size: usize) -> Result<(), InvalidBytesLengthError> { + if bytes.len() == array_size { Ok(()) } else { - Err(CodecError::UnexpectedChunkDecodedSize( - bytes.len(), - array_size, - )) + Err(InvalidBytesLengthError::new(bytes.len(), array_size)) } } @@ -259,9 +259,10 @@ fn validate_bytes( data_type_size: DataTypeSize, ) -> Result<(), CodecError> { match (bytes, data_type_size) { - (ArrayBytes::Fixed(bytes), DataTypeSize::Fixed(data_type_size)) => { - validate_bytes_flen(bytes, num_elements * data_type_size as u64) - } + (ArrayBytes::Fixed(bytes), DataTypeSize::Fixed(data_type_size)) => Ok(validate_bytes_flen( + bytes, + usize::try_from(num_elements * data_type_size as u64).unwrap(), + )?), (ArrayBytes::Variable(bytes, offsets), DataTypeSize::Variable) => { validate_bytes_vlen(bytes, offsets, num_elements) } diff --git a/zarrs/src/array/codec.rs b/zarrs/src/array/codec.rs index e911ec70..1eff5539 100644 --- a/zarrs/src/array/codec.rs +++ b/zarrs/src/array/codec.rs @@ -16,6 +16,7 @@ pub mod bytes_to_bytes; pub mod metadata_options; pub mod options; +use derive_more::derive::Display; pub use metadata_options::CodecMetadataOptions; pub use options::{CodecOptions, CodecOptionsBuilder}; @@ -79,6 +80,7 @@ pub use array_to_array_partial_encoder_default::ArrayToArrayPartialEncoderDefaul mod bytes_partial_encoder_default; pub use bytes_partial_encoder_default::BytesPartialEncoderDefault; +use zarrs_metadata::ArrayShape; use crate::storage::{StoreKeyOffsetValue, WritableStorage}; use crate::{ @@ -964,6 +966,76 @@ impl 
AsyncBytesPartialDecoderTraits for std::io::Cursor> { } } +/// An error indicating the length of bytes does not match the expected length. +#[derive(Debug, Error, Display)] +#[display("Invalid bytes len {len}, expected {expected_len}")] +pub struct InvalidBytesLengthError { + len: usize, + expected_len: usize, +} + +impl InvalidBytesLengthError { + /// Create a new [`InvalidBytesLengthError`]. + #[must_use] + pub fn new(len: usize, expected_len: usize) -> Self { + Self { len, expected_len } + } +} + +/// An error indicating the shape is not compatible with the expected number of elements. +#[derive(Debug, Error, Display)] +#[display("Invalid shape {shape:?} for number of elements {expected_num_elements}")] +pub struct InvalidArrayShapeError { + shape: ArrayShape, + expected_num_elements: usize, +} + +impl InvalidArrayShapeError { + /// Create a new [`InvalidArrayShapeError`]. + #[must_use] + pub fn new(shape: ArrayShape, expected_num_elements: usize) -> Self { + Self { + shape, + expected_num_elements, + } + } +} + +/// An error indicating the length of elements does not match the expected length. +#[derive(Debug, Error, Display)] +#[display("Invalid number of elements {num}, expected {expected}")] +pub struct InvalidNumberOfElementsError { + num: u64, + expected: u64, +} + +impl InvalidNumberOfElementsError { + /// Create a new [`InvalidNumberOfElementsError`]. + #[must_use] + pub fn new(num: u64, expected: u64) -> Self { + Self { num, expected } + } +} + +/// An array subset is out of bounds. +#[derive(Debug, Error, Display)] +#[display("Subset {subset} is out of bounds of {must_be_within}")] +pub struct SubsetOutOfBoundsError { + subset: ArraySubset, + must_be_within: ArraySubset, +} + +impl SubsetOutOfBoundsError { + /// Create a new [`InvalidNumberOfElementsError`]. + #[must_use] + pub fn new(subset: ArraySubset, must_be_within: ArraySubset) -> Self { + Self { + subset, + must_be_within, + } + } +} + /// A codec error. 
#[derive(Debug, Error)] pub enum CodecError { @@ -980,8 +1052,8 @@ pub enum CodecError { #[error("the array subset {_0} has the wrong dimensionality, expected {_1}")] InvalidArraySubsetDimensionalityError(ArraySubset, usize), /// The decoded size of a chunk did not match what was expected. - #[error("the size of a decoded chunk is {_0}, expected {_1}")] - UnexpectedChunkDecodedSize(usize, u64), + #[error("the size of a decoded chunk is {}, expected {}", _0.len, _0.expected_len)] + UnexpectedChunkDecodedSize(#[from] InvalidBytesLengthError), /// An embedded checksum does not match the decoded value. #[error("the checksum is invalid")] InvalidChecksum, @@ -1006,6 +1078,15 @@ pub enum CodecError { /// Expected variable length bytes. #[error("Expected variable length array bytes")] ExpectedVariableLengthBytes, + /// Invalid array shape. + #[error(transparent)] + InvalidArrayShape(#[from] InvalidArrayShapeError), + /// Invalid number of elements. + #[error(transparent)] + InvalidNumberOfElements(#[from] InvalidNumberOfElementsError), + /// Subset out of bounds. 
+ #[error(transparent)] + SubsetOutOfBounds(#[from] SubsetOutOfBoundsError), } impl From<&str> for CodecError { diff --git a/zarrs/src/array/codec/array_to_bytes/bytes/bytes_codec.rs b/zarrs/src/array/codec/array_to_bytes/bytes/bytes_codec.rs index ce379a00..a199d326 100644 --- a/zarrs/src/array/codec/array_to_bytes/bytes/bytes_codec.rs +++ b/zarrs/src/array/codec/array_to_bytes/bytes/bytes_codec.rs @@ -8,7 +8,7 @@ use crate::{ ArrayCodecTraits, ArrayPartialDecoderTraits, ArrayPartialEncoderDefault, ArrayPartialEncoderTraits, ArrayToBytesCodecTraits, BytesPartialDecoderTraits, BytesPartialEncoderTraits, CodecError, CodecMetadataOptions, CodecOptions, CodecTraits, - RecommendedConcurrency, + InvalidBytesLengthError, RecommendedConcurrency, }, ArrayBytes, BytesRepresentation, ChunkRepresentation, DataTypeSize, RawBytes, }, @@ -76,12 +76,11 @@ impl BytesCodec { )); } DataTypeSize::Fixed(data_type_size) => { - let array_size = decoded_representation.num_elements() * data_type_size as u64; - if value.len() as u64 != array_size { - return Err(CodecError::UnexpectedChunkDecodedSize( - value.len(), - array_size, - )); + let array_size = + usize::try_from(decoded_representation.num_elements() * data_type_size as u64) + .unwrap(); + if value.len() != array_size { + return Err(InvalidBytesLengthError::new(value.len(), array_size).into()); } else if data_type_size > 1 && self.endian.is_none() { return Err(CodecError::Other(format!( "tried to encode an array with element size {data_type_size} with endianness None" diff --git a/zarrs/src/array/codec/array_to_bytes/vlen.rs b/zarrs/src/array/codec/array_to_bytes/vlen.rs index 5873fe7a..b246c08e 100644 --- a/zarrs/src/array/codec/array_to_bytes/vlen.rs +++ b/zarrs/src/array/codec/array_to_bytes/vlen.rs @@ -13,7 +13,7 @@ pub use crate::metadata::v3::array::codec::vlen::{ }; use crate::{ array::{ - codec::{ArrayToBytesCodecTraits, CodecError, CodecOptions}, + codec::{ArrayToBytesCodecTraits, CodecError, CodecOptions, 
InvalidBytesLengthError}, convert_from_bytes_slice, ChunkRepresentation, CodecChain, DataType, Endianness, FillValue, RawBytes, }, @@ -62,10 +62,7 @@ fn get_vlen_bytes_and_offsets( ) -> Result<(Vec, Vec), CodecError> { // Get the index length and data start if bytes.len() < size_of::() { - return Err(CodecError::UnexpectedChunkDecodedSize( - bytes.len(), - size_of::() as u64, - )); + return Err(InvalidBytesLengthError::new(bytes.len(), size_of::()).into()); } let index_len = u64::from_le_bytes(bytes[0..size_of::()].try_into().unwrap()); let index_len = usize::try_from(index_len) diff --git a/zarrs/src/array/codec/array_to_bytes/vlen_v2.rs b/zarrs/src/array/codec/array_to_bytes/vlen_v2.rs index 856d7566..e22a0d28 100644 --- a/zarrs/src/array/codec/array_to_bytes/vlen_v2.rs +++ b/zarrs/src/array/codec/array_to_bytes/vlen_v2.rs @@ -11,7 +11,10 @@ use std::sync::Arc; pub(crate) const IDENTIFIER: &str = "vlen_v2"; // pub use vlen_v2::IDENTIFIER; -use crate::array::{codec::CodecError, RawBytes}; +use crate::array::{ + codec::{CodecError, InvalidBytesLengthError}, + RawBytes, +}; pub(crate) use vlen_v2_codec::VlenV2Codec; @@ -67,10 +70,7 @@ fn get_interleaved_bytes_and_offsets( // Validate the bytes is long enough to contain header and element lengths let header_length = size_of::() * (1 + num_elements); if bytes.len() < header_length { - return Err(CodecError::UnexpectedChunkDecodedSize( - bytes.len(), - header_length as u64, - )); + return Err(InvalidBytesLengthError::new(bytes.len(), header_length).into()); } // Validate the number of elements from the header diff --git a/zarrs/src/array/codec/bytes_to_bytes/gdeflate.rs b/zarrs/src/array/codec/bytes_to_bytes/gdeflate.rs index ac74783e..13ade3f2 100644 --- a/zarrs/src/array/codec/bytes_to_bytes/gdeflate.rs +++ b/zarrs/src/array/codec/bytes_to_bytes/gdeflate.rs @@ -28,7 +28,7 @@ pub use gdeflate_codec::GDeflateCodec; use crate::{ array::{ - codec::{Codec, CodecError, CodecPlugin}, + codec::{Codec, CodecError, 
CodecPlugin, InvalidBytesLengthError}, RawBytes, }, metadata::v3::{array::codec::gdeflate, MetadataV3}, @@ -61,10 +61,11 @@ const GDEFLATE_STATIC_HEADER_LENGTH: usize = 2 * size_of::(); fn gdeflate_decode(encoded_value: &RawBytes<'_>) -> Result, CodecError> { if encoded_value.len() < GDEFLATE_STATIC_HEADER_LENGTH { - return Err(CodecError::UnexpectedChunkDecodedSize( + return Err(InvalidBytesLengthError::new( encoded_value.len(), - GDEFLATE_STATIC_HEADER_LENGTH as u64, - )); + GDEFLATE_STATIC_HEADER_LENGTH, + ) + .into()); } // Decode the static header @@ -77,10 +78,11 @@ fn gdeflate_decode(encoded_value: &RawBytes<'_>) -> Result, CodecError> // Check length of dynamic header let dynamic_header_length = num_pages * size_of::(); if encoded_value.len() < GDEFLATE_STATIC_HEADER_LENGTH + dynamic_header_length { - return Err(CodecError::UnexpectedChunkDecodedSize( + return Err(InvalidBytesLengthError::new( encoded_value.len(), - (GDEFLATE_STATIC_HEADER_LENGTH + dynamic_header_length) as u64, - )); + GDEFLATE_STATIC_HEADER_LENGTH + dynamic_header_length, + ) + .into()); } // Decode the pages From 091019bd5c7b5202d108e1bfdc21421ff8009656 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Wed, 22 Jan 2025 08:25:29 +1100 Subject: [PATCH 22/45] refactor!: retrieve and decode `_into` APIs (#136) --- CHANGELOG.md | 7 + zarrs/src/array.rs | 4 + zarrs/src/array/array_async_readable.rs | 98 ++--- .../array/array_async_readable_writable.rs | 16 +- zarrs/src/array/array_bytes.rs | 206 ++++------ .../array/array_bytes_fixed_disjoint_view.rs | 354 ++++++++++++++++++ zarrs/src/array/array_sync_readable.rs | 88 ++--- .../src/array/array_sync_readable_writable.rs | 16 +- .../array/array_sync_sharded_readable_ext.rs | 23 +- .../chunk_cache/array_chunk_cache_ext_sync.rs | 26 +- zarrs/src/array/chunk_grid.rs | 3 + zarrs/src/array/codec.rs | 95 ++--- .../codec/array_partial_encoder_default.rs | 16 +- .../array_to_array_partial_encoder_default.rs | 16 +- 
.../array/codec/array_to_bytes/codec_chain.rs | 53 +-- .../array_to_bytes/sharding/sharding_codec.rs | 220 ++++------- .../sharding/sharding_partial_decoder.rs | 85 +++-- .../sharding/sharding_partial_encoder.rs | 20 +- 18 files changed, 749 insertions(+), 597 deletions(-) create mode 100644 zarrs/src/array/array_bytes_fixed_disjoint_view.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 4bed6036..189c092b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,12 +10,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Add `array:codec::{InvalidBytesLengthError,InvalidArrayShapeError,InvalidNumberOfElementsError,SubsetOutOfBoundsError}` - Add `ArraySubset::inbounds_shape()` (matches the old `ArraySubset::inbounds` behaviour) +- Add `ArrayBytesFixedDisjointView[CreateError]` ### Changed - **Breaking**: change `ArraySubset::inbounds` to take another subset rather than a shape - **Breaking**: `CodecError` enum changes: - Change `CodecError::UnexpectedChunkDecodedSize` to an `InvalidBytesLengthError` - Add `CodecError::{InvalidArrayShape,InvalidNumberOfElements,SubsetOutOfBounds}` +- **Breaking**: Change output args to `ArrayBytesFixedDisjointView` and make safe the following: + - `Array::[async_]retrieve_chunk[_subset]_into` + - `[Async]ArrayPartialDecoderTraits::partial_decode_into` + - `ArrayToBytesCodecTraits::decode_into` + - `zarrs::array::copy_fill_value_into` + - `zarrs::array::update_array_bytes` ## [0.19.1] - 2025-01-19 diff --git a/zarrs/src/array.rs b/zarrs/src/array.rs index 4d595709..7f1800e1 100644 --- a/zarrs/src/array.rs +++ b/zarrs/src/array.rs @@ -23,6 +23,7 @@ mod array_builder; mod array_bytes; +mod array_bytes_fixed_disjoint_view; mod array_errors; mod array_metadata_options; mod array_representation; @@ -50,6 +51,9 @@ pub use self::{ copy_fill_value_into, update_array_bytes, ArrayBytes, ArrayBytesError, RawBytes, RawBytesOffsets, }, + array_bytes_fixed_disjoint_view::{ + ArrayBytesFixedDisjointView, 
ArrayBytesFixedDisjointViewCreateError, + }, array_errors::{ArrayCreateError, ArrayError}, array_metadata_options::ArrayMetadataOptions, array_representation::{ diff --git a/zarrs/src/array/array_async_readable.rs b/zarrs/src/array/array_async_readable.rs index 8139b356..f418968e 100644 --- a/zarrs/src/array/array_async_readable.rs +++ b/zarrs/src/array/array_async_readable.rs @@ -18,8 +18,8 @@ use super::{ }, concurrency::concurrency_chunks_and_codec, element::ElementOwned, - Array, ArrayBytes, ArrayCreateError, ArrayError, ArrayMetadata, ArrayMetadataV2, - ArrayMetadataV3, ArraySize, DataTypeSize, + Array, ArrayBytes, ArrayBytesFixedDisjointView, ArrayCreateError, ArrayError, ArrayMetadata, + ArrayMetadataV2, ArrayMetadataV3, ArraySize, DataTypeSize, }; #[cfg(feature = "ndarray")] @@ -335,12 +335,10 @@ impl Array { } /// Async variant of [`retrieve_chunk_into`](Array::retrieve_chunk_into). - async unsafe fn async_retrieve_chunk_into( + async fn async_retrieve_chunk_into( &self, chunk_indices: &[u64], - output: &UnsafeCellSlice<'_, u8>, - output_shape: &[u64], - output_subset: &ArraySubset, + output_view: &mut ArrayBytesFixedDisjointView<'_>, options: &CodecOptions, ) -> Result<(), ArrayError> { if chunk_indices.len() != self.dimensionality() { @@ -360,29 +358,17 @@ impl Array { if let Some(chunk_encoded) = chunk_encoded { let chunk_encoded: Vec = chunk_encoded.into(); let chunk_representation = self.chunk_array_representation(chunk_indices)?; - unsafe { - self.codecs() - .decode_into( - Cow::Owned(chunk_encoded), - &chunk_representation, - output, - output_shape, - output_subset, - options, - ) - .map_err(ArrayError::CodecError) - } - } else { - unsafe { - copy_fill_value_into( - self.data_type(), - self.fill_value(), - output, - output_shape, - output_subset, + self.codecs() + .decode_into( + Cow::Owned(chunk_encoded), + &chunk_representation, + output_view, + options, ) .map_err(ArrayError::CodecError) - } + } else { + copy_fill_value_into(self.data_type(), 
self.fill_value(), output_view) + .map_err(ArrayError::CodecError) } } @@ -650,19 +636,25 @@ impl Array { let chunk_subset = self.chunk_subset(&chunk_indices)?; let chunk_subset_overlap = chunk_subset.overlap(array_subset)?; - unsafe { - self.async_retrieve_chunk_subset_into( - &chunk_indices, - &chunk_subset_overlap - .relative_to(chunk_subset.start())?, - &output, + + let mut output_view = unsafe { + // SAFETY: chunks represent disjoint array subsets + ArrayBytesFixedDisjointView::new_unchecked( + output, + data_type_size, array_subset.shape(), - &chunk_subset_overlap - .relative_to(array_subset.start())?, - &options, + chunk_subset_overlap + .relative_to(array_subset.start()) + .unwrap(), ) - .await?; - } + }; + self.async_retrieve_chunk_subset_into( + &chunk_indices, + &chunk_subset_overlap.relative_to(chunk_subset.start())?, + &mut output_view, + &options, + ) + .await?; // let chunk_subset_bytes = self // .async_retrieve_chunk_subset_opt( // &chunk_indices, @@ -773,13 +765,11 @@ impl Array { Ok(bytes) } - async unsafe fn async_retrieve_chunk_subset_into( + async fn async_retrieve_chunk_subset_into( &self, chunk_indices: &[u64], chunk_subset: &ArraySubset, - output: &UnsafeCellSlice<'_, u8>, - output_shape: &[u64], - output_subset: &ArraySubset, + output_view: &mut ArrayBytesFixedDisjointView<'_>, options: &CodecOptions, ) -> Result<(), ArrayError> { let chunk_representation = self.chunk_array_representation(chunk_indices)?; @@ -794,16 +784,8 @@ impl Array { && chunk_subset.shape() == chunk_representation.shape_u64() { // Fast path if `chunk_subset` encompasses the whole chunk - unsafe { - self.async_retrieve_chunk_into( - chunk_indices, - output, - output_shape, - output_subset, - options, - ) + self.async_retrieve_chunk_into(chunk_indices, output_view, options) .await - } } else { let storage_handle = Arc::new(StorageHandle::new(self.storage.clone())); let storage_transformer = self @@ -815,14 +797,12 @@ impl Array { self.chunk_key(chunk_indices), )); - 
unsafe { - self.codecs - .clone() - .async_partial_decoder(input_handle, &chunk_representation, options) - .await? - .partial_decode_into(chunk_subset, output, output_shape, output_subset, options) - .await?; - } + self.codecs + .clone() + .async_partial_decoder(input_handle, &chunk_representation, options) + .await? + .partial_decode_into(chunk_subset, output_view, options) + .await?; Ok(()) } } diff --git a/zarrs/src/array/array_async_readable_writable.rs b/zarrs/src/array/array_async_readable_writable.rs index 4a958ede..74a3ceaa 100644 --- a/zarrs/src/array/array_async_readable_writable.rs +++ b/zarrs/src/array/array_async_readable_writable.rs @@ -158,15 +158,13 @@ impl Array, - output_shape: &[u64], - subset_bytes: &RawBytes, - subset: &ArraySubset, - data_type_size: usize, -) { - debug_assert_eq!( - output_bytes.len(), - usize::try_from(output_shape.iter().product::()).unwrap() * data_type_size - ); - debug_assert_eq!( - subset_bytes.len(), - subset.num_elements_usize() * data_type_size, - ); - - let contiguous_indices = - unsafe { subset.contiguous_linearised_indices_unchecked(output_shape) }; - let length = contiguous_indices.contiguous_elements_usize() * data_type_size; - let mut decoded_offset = 0; - // TODO: Par iteration? 
- for array_subset_element_index in &contiguous_indices { - let output_offset = usize::try_from(array_subset_element_index).unwrap() * data_type_size; - debug_assert!((output_offset + length) <= output_bytes.len()); - debug_assert!((decoded_offset + length) <= subset_bytes.len()); - unsafe { - output_bytes - .index_mut(output_offset..output_offset + length) - .copy_from_slice(&subset_bytes[decoded_offset..decoded_offset + length]); - } - decoded_offset += length; +pub(crate) fn update_bytes_vlen<'a>( + input_bytes: &RawBytes, + input_offsets: &RawBytesOffsets, + input_shape: &[u64], + update_bytes: &RawBytes, + update_offsets: &RawBytesOffsets, + update_subset: &ArraySubset, +) -> Result, IncompatibleArraySubsetAndShapeError> { + if !update_subset.inbounds_shape(input_shape) { + return Err(IncompatibleArraySubsetAndShapeError::new( + update_subset.clone(), + input_shape.to_vec(), + )); } -} -pub(crate) fn update_bytes_vlen<'a>( - output_bytes: &RawBytes, - output_offsets: &RawBytesOffsets, - output_shape: &[u64], - subset_bytes: &RawBytes, - subset_offsets: &RawBytesOffsets, - subset: &ArraySubset, -) -> ArrayBytes<'a> { // Get the current and new length of the bytes in the chunk subset - let size_subset_new = { - let chunk_subset_indices = ArraySubset::new_with_shape(subset.shape().to_vec()) - .linearised_indices(subset.shape()) - .unwrap(); - chunk_subset_indices - .iter() - .map(|index| { - let index = usize::try_from(index).unwrap(); - subset_offsets[index + 1] - subset_offsets[index] - }) - .sum::() - }; + let size_subset_new = update_offsets + .iter() + .tuple_windows() + .map(|(curr, next)| next - curr) + .sum::(); let size_subset_old = { - let chunk_indices = subset.linearised_indices(output_shape).unwrap(); + let chunk_indices = update_subset.linearised_indices(input_shape).unwrap(); chunk_indices .iter() .map(|index| { let index = usize::try_from(index).unwrap(); - output_offsets[index + 1] - output_offsets[index] + input_offsets[index + 1] - 
input_offsets[index] }) .sum::() }; // Populate new offsets and bytes - let mut offsets_new = Vec::with_capacity(output_offsets.len()); - let bytes_new_len = (output_bytes.len() + size_subset_new) + let mut offsets_new = Vec::with_capacity(input_offsets.len()); + let bytes_new_len = (input_bytes.len() + size_subset_new) .checked_sub(size_subset_old) .unwrap(); let mut bytes_new = Vec::with_capacity(bytes_new_len); - let indices = ArraySubset::new_with_shape(output_shape.to_vec()).indices(); + let indices = ArraySubset::new_with_shape(input_shape.to_vec()).indices(); for (chunk_index, indices) in indices.iter().enumerate() { offsets_new.push(bytes_new.len()); - if subset.contains(&indices) { + if update_subset.contains(&indices) { let subset_indices = indices .iter() - .zip(subset.start()) + .zip(update_subset.start()) .map(|(i, s)| i - s) .collect::>(); let subset_index = - usize::try_from(ravel_indices(&subset_indices, subset.shape())).unwrap(); - let start = subset_offsets[subset_index]; - let end = subset_offsets[subset_index + 1]; - bytes_new.extend_from_slice(&subset_bytes[start..end]); + usize::try_from(ravel_indices(&subset_indices, update_subset.shape())).unwrap(); + let start = update_offsets[subset_index]; + let end = update_offsets[subset_index + 1]; + bytes_new.extend_from_slice(&update_bytes[start..end]); } else { - let start = output_offsets[chunk_index]; - let end = output_offsets[chunk_index + 1]; - bytes_new.extend_from_slice(&output_bytes[start..end]); + let start = input_offsets[chunk_index]; + let end = input_offsets[chunk_index + 1]; + bytes_new.extend_from_slice(&input_bytes[start..end]); } } offsets_new.push(bytes_new.len()); - ArrayBytes::new_vlen(bytes_new, offsets_new) + Ok(ArrayBytes::new_vlen(bytes_new, offsets_new)) } /// Update a subset of an array. /// /// This function is used internally by [`crate::array::Array::store_chunk_subset_opt`] and [`crate::array::Array::async_store_chunk_subset_opt`]. 
/// -/// # Safety -/// The caller must ensure that: -/// - `output_bytes` is an array with `output_shape` and `data_type_size`, -/// - `output_subset_bytes` is an array with the shape of `output_subset` and `data_type_size`, -/// - `output_subset` is within the bounds of `output_shape`, and -/// - `output_bytes` and `output_subset_bytes` are compatible (e.g. both fixed or both variable sized). -#[must_use] -pub unsafe fn update_array_bytes<'a>( +/// # Errors +/// Returns a [`CodecError`] if +/// - `output_bytes` are not compatible with the `output_shape` and `data_type_size`, +/// - `output_subset_bytes` are not compatible with the `output_subset` and `data_type_size`, +/// - `output_subset` is not within the bounds of `output_shape` +pub fn update_array_bytes<'a>( output_bytes: ArrayBytes, output_shape: &[u64], output_subset: &ArraySubset, output_subset_bytes: &ArrayBytes, data_type_size: DataTypeSize, -) -> ArrayBytes<'a> { +) -> Result, CodecError> { match (output_bytes, output_subset_bytes, data_type_size) { ( ArrayBytes::Variable(chunk_bytes, chunk_offsets), ArrayBytes::Variable(chunk_subset_bytes, chunk_subset_offsets), DataTypeSize::Variable, - ) => update_bytes_vlen( + ) => Ok(update_bytes_vlen( &chunk_bytes, &chunk_offsets, output_shape, chunk_subset_bytes, chunk_subset_offsets, output_subset, - ), + )?), ( ArrayBytes::Fixed(chunk_bytes), ArrayBytes::Fixed(chunk_subset_bytes), DataTypeSize::Fixed(data_type_size), ) => { let mut chunk_bytes = chunk_bytes.into_owned(); - { - let chunk_bytes = UnsafeCellSlice::new(&mut chunk_bytes); - update_bytes_flen( - &chunk_bytes, - output_shape, - chunk_subset_bytes, - output_subset, + let mut output_view = unsafe { + // SAFETY: Only one view is created, so it is disjoint + ArrayBytesFixedDisjointView::new( + UnsafeCellSlice::new(&mut chunk_bytes), data_type_size, - ); + output_shape, + output_subset.clone(), + ) } - ArrayBytes::new_flen(chunk_bytes) - } - (_, _, _) => { - unreachable!("Validation should occur outside 
of this function") + .map_err(CodecError::from)?; + output_view.copy_from_slice(chunk_subset_bytes)?; + Ok(ArrayBytes::new_flen(chunk_bytes)) } + (_, _, DataTypeSize::Variable) => Err(CodecError::ExpectedVariableLengthBytes), + (_, _, DataTypeSize::Fixed(_)) => Err(CodecError::ExpectedFixedLengthBytes), } } @@ -541,25 +503,15 @@ pub(crate) fn extract_decoded_regions_vlen<'a>( /// - `data_type` and `fill_value` are compatible, /// - `output` holds enough space for the preallocated bytes of an array with `output_shape` and `data_type`, and /// - `output_subset` is within the bounds of `output_shape`. -pub unsafe fn copy_fill_value_into( +pub fn copy_fill_value_into( data_type: &DataType, fill_value: &FillValue, - output: &UnsafeCellSlice, - output_shape: &[u64], - output_subset: &ArraySubset, + output_view: &mut ArrayBytesFixedDisjointView, ) -> Result<(), CodecError> { - let array_size = ArraySize::new(data_type.size(), output_subset.num_elements()); - if let (ArrayBytes::Fixed(fill_value_bytes), Some(data_type_size)) = ( - ArrayBytes::new_fill_value(array_size, fill_value), - data_type.fixed_size(), - ) { - update_bytes_flen( - output, - output_shape, - &fill_value_bytes, - output_subset, - data_type_size, - ); + let array_size = ArraySize::new(data_type.size(), output_view.num_elements()); + if let ArrayBytes::Fixed(fill_value_bytes) = ArrayBytes::new_fill_value(array_size, fill_value) + { + output_view.copy_from_slice(&fill_value_bytes)?; Ok(()) } else { // TODO: Variable length data type support? 
@@ -654,21 +606,27 @@ mod tests { let mut bytes_array = vec![0u8; 4 * 4]; { let bytes_array = UnsafeCellSlice::new(&mut bytes_array); - update_bytes_flen( - &bytes_array, - &vec![4, 4], - &vec![1u8, 2].into(), - &ArraySubset::new_with_ranges(&[1..2, 1..3]), - 1, - ); - - update_bytes_flen( - &bytes_array, - &vec![4, 4], - &vec![3u8, 4].into(), - &ArraySubset::new_with_ranges(&[3..4, 0..2]), - 1, - ); + let mut output_non_overlapping_0 = unsafe { + // SAFETY: Only one view is created, so it is disjoint + ArrayBytesFixedDisjointView::new_unchecked( + bytes_array, + size_of::(), + &[4, 4], + ArraySubset::new_with_ranges(&[1..2, 1..3]), + ) + }; + output_non_overlapping_0.copy_from_slice(&[1u8, 2]).unwrap(); + + let mut output_non_overlapping_1 = unsafe { + // SAFETY: Only one view is created, so it is disjoint + ArrayBytesFixedDisjointView::new_unchecked( + bytes_array, + size_of::(), + &[4, 4], + ArraySubset::new_with_ranges(&[3..4, 0..2]), + ) + }; + output_non_overlapping_1.copy_from_slice(&[3u8, 4]).unwrap(); } debug_assert_eq!( diff --git a/zarrs/src/array/array_bytes_fixed_disjoint_view.rs b/zarrs/src/array/array_bytes_fixed_disjoint_view.rs new file mode 100644 index 00000000..7e5caf8c --- /dev/null +++ b/zarrs/src/array/array_bytes_fixed_disjoint_view.rs @@ -0,0 +1,354 @@ +use derive_more::derive::Display; +use thiserror::Error; +use unsafe_cell_slice::UnsafeCellSlice; + +use crate::array_subset::{ + iterators::{ContiguousIndices, ContiguousLinearisedIndices}, + ArraySubset, +}; + +use super::codec::{CodecError, InvalidBytesLengthError, SubsetOutOfBoundsError}; + +/// A disjoint view of the bytes in an array with a fixed-length data type. +/// +/// The `subset` represented by this view must not overlap with the `subset` of any other created views that reference the same array bytes. 
+pub struct ArrayBytesFixedDisjointView<'a> { + bytes: UnsafeCellSlice<'a, u8>, + data_type_size: usize, + shape: &'a [u64], + subset: ArraySubset, + bytes_in_subset_len: usize, +} + +/// Errors that can occur when creating a [`ArrayBytesFixedDisjointView`]. +#[derive(Debug, Error, Display)] +pub enum ArrayBytesFixedDisjointViewCreateError { + /// The subset is out-of-bounds of the array shape. + SubsetOutOfBounds(#[from] SubsetOutOfBoundsError), + /// The length of the bytes is not the correct length. + InvalidBytesLength(#[from] InvalidBytesLengthError), +} + +impl From for CodecError { + fn from(value: ArrayBytesFixedDisjointViewCreateError) -> Self { + match value { + ArrayBytesFixedDisjointViewCreateError::SubsetOutOfBounds(e) => e.into(), + ArrayBytesFixedDisjointViewCreateError::InvalidBytesLength(e) => e.into(), + } + } +} + +impl<'a> ArrayBytesFixedDisjointView<'a> { + /// Create a new non-overlapping view of the bytes in an array. + /// + /// # Errors + /// Returns [`ArrayBytesFixedDisjointViewCreateError`] if + /// - `subset` is out-of-bounds of `shape`, or + /// - the length of `bytes` is not the product of the elements in `shape` multiplied by `data_type_size`. + /// + /// # Safety + /// The `subset` represented by this view must not overlap with the `subset` of any other created views that reference the same array bytes. + /// + /// # Panics + /// Panics if the product of the elements in `shape` multiplied by `data_type_size` exceeds [`usize::MAX`]. 
+ pub unsafe fn new( + bytes: UnsafeCellSlice<'a, u8>, + data_type_size: usize, + shape: &'a [u64], + subset: ArraySubset, + ) -> Result { + if !subset.inbounds_shape(shape) { + let bounding_subset = ArraySubset::new_with_shape(shape.to_vec()); + return Err(SubsetOutOfBoundsError::new(subset, bounding_subset).into()); + } + let bytes_in_array_len = + usize::try_from(shape.iter().product::()).unwrap() * data_type_size; + if bytes.len() != bytes_in_array_len { + return Err(InvalidBytesLengthError::new(bytes.len(), bytes_in_array_len).into()); + } + + let bytes_in_subset_len = subset.num_elements_usize() * data_type_size; + Ok(Self { + bytes, + data_type_size, + shape, + subset, + bytes_in_subset_len, + }) + } + + /// Create a new non-overlapping view of the bytes in an array. + /// + /// # Safety + /// - `subset` must be inbounds of `shape`, + /// - the length of `bytes` must be the product of the elements in `shape` multiplied by `data_type_size`, and + /// - the `subset` represented by this view must not overlap with the `subset` of any other created views that reference the same array bytes. + /// + /// # Panics + /// Panics if the product of the elements in `shape` multiplied by `data_type_size` exceeds [`usize::MAX`]. + #[must_use] + pub unsafe fn new_unchecked( + bytes: UnsafeCellSlice<'a, u8>, + data_type_size: usize, + shape: &'a [u64], + subset: ArraySubset, + ) -> Self { + debug_assert!(subset.inbounds_shape(shape)); + debug_assert_eq!( + bytes.len(), + usize::try_from(shape.iter().product::()).unwrap() * data_type_size + ); + + let bytes_in_subset_len = subset.num_elements_usize() * data_type_size; + Self { + bytes, + data_type_size, + shape, + subset, + bytes_in_subset_len, + } + } + + /// Create a new non-overlapping view of the bytes in an array that is a subset of the current view. + /// + /// # Errors + /// Returns [`SubsetOutOfBoundsError`] if `subset` is out-of-bounds of the parent subset. 
+ /// + /// # Safety + /// The `subset` represented by this view must not overlap with the `subset` of any other created views that reference the same array bytes. + pub unsafe fn subdivide( + &self, + subset: ArraySubset, + ) -> Result, SubsetOutOfBoundsError> { + if !subset.inbounds(&self.subset) { + return Err(SubsetOutOfBoundsError::new(subset, self.subset.clone())); + } + + Ok(unsafe { + // SAFETY: all inputs have been validated + Self::new_unchecked(self.bytes, self.data_type_size, self.shape, subset) + }) + } + + /// Create a new non-overlapping view of the bytes in an array that is a subset of the current view. + /// + /// # Safety + /// - `subset` must be inbounds of the parent subset, and + /// - the `subset` represented by this view must not overlap with the `subset` of any other created views that reference the same array bytes. + #[must_use] + pub unsafe fn subdivide_unchecked( + &self, + subset: ArraySubset, + ) -> ArrayBytesFixedDisjointView<'a> { + debug_assert!(subset.inbounds(&self.subset)); + + unsafe { Self::new_unchecked(self.bytes, self.data_type_size, self.shape, subset) } + } + + /// Return the shape of the bytes this view is created from. + #[must_use] + pub fn shape(&self) -> &[u64] { + self.shape + } + + /// Return the subset of the bytes this view is created from. + #[must_use] + pub fn subset(&self) -> &ArraySubset { + &self.subset + } + + /// Return the number of elements in the view. 
+ #[must_use] + pub fn num_elements(&self) -> u64 { + self.subset.num_elements() + } + + fn contiguous_indices(&self) -> ContiguousIndices { + unsafe { + // SAFETY: the output shape encapsulates the output subset, checked in constructor + self.subset.contiguous_indices_unchecked(self.shape) + } + } + + fn contiguous_linearised_indices(&self) -> ContiguousLinearisedIndices { + unsafe { + // SAFETY: the output shape encapsulates the output subset, checked in constructor + self.subset + .contiguous_linearised_indices_unchecked(self.shape) + } + } + + /// Return the contiguous element length of the view. + /// + /// This is the number of elements that are accessed in a single contiguous block. + #[must_use] + pub fn num_contiguous_elements(&self) -> usize { + self.contiguous_indices().contiguous_elements_usize() + } + + /// Return the size in bytes of contiguous elements in the view. + /// + /// This is the number of elements that are accessed in a single contiguous block. + #[must_use] + pub fn contiguous_bytes_len(&self) -> usize { + self.contiguous_indices().contiguous_elements_usize() * self.data_type_size + } + + /// Fill the view with the fill value. + /// + /// # Errors + /// Returns [`InvalidBytesLengthError`] if the length of the `fill_value` does not match the data type size. + /// + /// # Panics + /// Panics if an offset into the internal bytes reference exceeds [`usize::MAX`]. 
+ pub fn fill(&mut self, fill_value: &[u8]) -> Result<(), InvalidBytesLengthError> { + if fill_value.len() != self.data_type_size { + return Err(InvalidBytesLengthError::new( + fill_value.len(), + self.data_type_size, + )); + } + + let fill_value_contiguous = fill_value.repeat(self.num_contiguous_elements()); + let length = self.contiguous_bytes_len(); + debug_assert_eq!(fill_value_contiguous.len(), length); + let contiguous_indices = self.contiguous_linearised_indices(); + contiguous_indices.into_iter().for_each(|index| { + let offset = usize::try_from(index * self.data_type_size as u64).unwrap(); + unsafe { + self.bytes + .index_mut(offset..offset + length) + .copy_from_slice(&fill_value_contiguous); + } + }); + Ok(()) + } + + /// Copy bytes into the view. + /// + /// The `subset_bytes` must be the same length as the byte length of the elements in the view. + /// + /// # Errors + /// Returns an [`InvalidBytesLengthError`] if the length of `subset_bytes` is not the same as the byte length of the elements in the view. + /// + /// # Panics + /// Panics if an offset into the internal bytes reference exceeds [`usize::MAX`]. 
+ pub fn copy_from_slice(&mut self, subset_bytes: &[u8]) -> Result<(), InvalidBytesLengthError> { + if subset_bytes.len() != self.bytes_in_subset_len { + return Err(InvalidBytesLengthError::new( + subset_bytes.len(), + self.bytes_in_subset_len, + )); + } + + let contiguous_indices = self.contiguous_linearised_indices(); + let length = contiguous_indices.contiguous_elements_usize() * self.data_type_size; + + let bytes_copied = contiguous_indices.into_iter().fold( + 0, + |subset_offset: usize, array_subset_element_index: u64| { + let output_offset = + usize::try_from(array_subset_element_index).unwrap() * self.data_type_size; + debug_assert!((output_offset + length) <= self.bytes.len()); + debug_assert!((subset_offset + length) <= subset_bytes.len()); + let subset_offset_end = subset_offset + length; + unsafe { + self.bytes + .index_mut(output_offset..output_offset + length) + .copy_from_slice(&subset_bytes[subset_offset..subset_offset_end]); + } + subset_offset_end + }, + ); + debug_assert_eq!(bytes_copied, subset_bytes.len()); + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn disjoint_view() { + let mut bytes = (0..9).collect::>(); + let shape = vec![3, 3]; + { + let bytes = UnsafeCellSlice::new(&mut bytes); + + assert!(unsafe { + ArrayBytesFixedDisjointView::new( + bytes, + 1, + &[10, 10], + ArraySubset::new_with_ranges(&[0..2, 1..3]), + ) + } + .is_err()); // incompatible shape + assert!(unsafe { + ArrayBytesFixedDisjointView::new( + bytes, + 2, + &shape, + ArraySubset::new_with_ranges(&[0..2, 1..3]), + ) + } + .is_err()); // invalid bytes length + assert!(unsafe { + ArrayBytesFixedDisjointView::new( + bytes, + 1, + &shape, + ArraySubset::new_with_ranges(&[0..2, 1..10]), + ) + } + .is_err()); // OOB + + let mut view0 = unsafe { + ArrayBytesFixedDisjointView::new( + bytes, + 1, + &shape, + ArraySubset::new_with_ranges(&[0..2, 1..3]), + ) + } + .unwrap(); + assert_eq!(view0.shape(), shape); + + view0.copy_from_slice(&[11, 12, 14, 
15]).unwrap(); + assert!(view0.copy_from_slice(&[11, 12, 14, 15, 255]).is_err()); // wrong length + + let mut view0a = + unsafe { view0.subdivide(ArraySubset::new_with_ranges(&[1..2, 1..3])) }.unwrap(); + view0a.copy_from_slice(&[24, 25]).unwrap(); + assert!(view0a.copy_from_slice(&[]).is_err()); // wrong length + + assert!( + unsafe { view0a.subdivide(ArraySubset::new_with_ranges(&[1..2, 1..3])) }.is_ok() + ); + assert!( + unsafe { view0a.subdivide(ArraySubset::new_with_ranges(&[1..2, 2..3])) }.is_ok() + ); + assert!( + unsafe { view0a.subdivide(ArraySubset::new_with_ranges(&[0..2, 1..3])) }.is_err() + ); // OOB + assert!( + unsafe { view0a.subdivide(ArraySubset::new_with_ranges(&[1..2, 1..4])) }.is_err() + ); // OOB + + let mut view1 = unsafe { + ArrayBytesFixedDisjointView::new( + bytes, + 1, + &shape, + ArraySubset::new_with_ranges(&[2..3, 1..3]), + ) + } + .unwrap(); + view1.fill(&[255]).unwrap(); + assert!(view1.fill(&[255, 255]).is_err()); // invalid fill value + } + assert_eq!(&bytes, &[0, 11, 12, 3, 24, 25, 6, 255, 255]); + } +} diff --git a/zarrs/src/array/array_sync_readable.rs b/zarrs/src/array/array_sync_readable.rs index a2b8d4b2..d699e944 100644 --- a/zarrs/src/array/array_sync_readable.rs +++ b/zarrs/src/array/array_sync_readable.rs @@ -20,7 +20,8 @@ use super::{ }, concurrency::concurrency_chunks_and_codec, element::ElementOwned, - Array, ArrayCreateError, ArrayError, ArrayMetadata, ArrayMetadataV3, ArraySize, DataTypeSize, + Array, ArrayBytesFixedDisjointView, ArrayCreateError, ArrayError, ArrayMetadata, + ArrayMetadataV3, ArraySize, DataTypeSize, }; #[cfg(feature = "ndarray")] @@ -458,12 +459,10 @@ impl Array { } } - unsafe fn retrieve_chunk_into( + fn retrieve_chunk_into( &self, chunk_indices: &[u64], - output: &UnsafeCellSlice, - output_shape: &[u64], - output_subset: &ArraySubset, + output_view: &mut ArrayBytesFixedDisjointView<'_>, options: &CodecOptions, ) -> Result<(), ArrayError> { if chunk_indices.len() != self.dimensionality() { @@ 
-481,28 +480,17 @@ impl Array { if let Some(chunk_encoded) = chunk_encoded { let chunk_encoded: Vec = chunk_encoded.into(); let chunk_representation = self.chunk_array_representation(chunk_indices)?; - unsafe { - self.codecs().decode_into( + self.codecs() + .decode_into( Cow::Owned(chunk_encoded), &chunk_representation, - output, - output_shape, - output_subset, + output_view, options, ) - } - .map_err(ArrayError::CodecError) + .map_err(ArrayError::CodecError) } else { - unsafe { - copy_fill_value_into( - self.data_type(), - self.fill_value(), - output, - output_shape, - output_subset, - ) - } - .map_err(ArrayError::CodecError) + copy_fill_value_into(self.data_type(), self.fill_value(), output_view) + .map_err(ArrayError::CodecError) } } @@ -720,16 +708,21 @@ impl Array { let retrieve_chunk = |chunk_indices: Vec| { let chunk_subset = self.chunk_subset(&chunk_indices)?; let chunk_subset_overlap = chunk_subset.overlap(array_subset)?; - unsafe { - self.retrieve_chunk_subset_into( - &chunk_indices, - &chunk_subset_overlap.relative_to(chunk_subset.start())?, - &output, + let mut output_view = unsafe { + // SAFETY: chunks represent disjoint array subsets + ArrayBytesFixedDisjointView::new_unchecked( + output, + data_type_size, array_subset.shape(), - &chunk_subset_overlap.relative_to(array_subset.start())?, - &options, - )?; - } + chunk_subset_overlap.relative_to(array_subset.start())?, + ) + }; + self.retrieve_chunk_subset_into( + &chunk_indices, + &chunk_subset_overlap.relative_to(chunk_subset.start())?, + &mut output_view, + &options, + )?; // let chunk_subset_bytes = self.retrieve_chunk_subset_opt( // &chunk_indices, // &chunk_subset_overlap.relative_to(chunk_subset.start())?, @@ -827,13 +820,11 @@ impl Array { Ok(bytes) } - unsafe fn retrieve_chunk_subset_into( + fn retrieve_chunk_subset_into( &self, chunk_indices: &[u64], chunk_subset: &ArraySubset, - output: &UnsafeCellSlice, - output_shape: &[u64], - output_subset: &ArraySubset, + output_view: &mut 
ArrayBytesFixedDisjointView<'_>, options: &CodecOptions, ) -> Result<(), ArrayError> { let chunk_representation = self.chunk_array_representation(chunk_indices)?; @@ -848,15 +839,7 @@ impl Array { && chunk_subset.shape() == chunk_representation.shape_u64() { // Fast path if `chunk_subset` encompasses the whole chunk - unsafe { - self.retrieve_chunk_into( - chunk_indices, - output, - output_shape, - output_subset, - options, - ) - } + self.retrieve_chunk_into(chunk_indices, output_view, options) } else { let storage_handle = Arc::new(StorageHandle::new(self.storage.clone())); let storage_transformer = self @@ -866,19 +849,10 @@ impl Array { storage_transformer, self.chunk_key(chunk_indices), )); - - unsafe { - self.codecs - .clone() - .partial_decoder(input_handle, &chunk_representation, options)? - .partial_decode_into( - chunk_subset, - output, - output_shape, - output_subset, - options, - )?; - } + self.codecs + .clone() + .partial_decoder(input_handle, &chunk_representation, options)? 
+ .partial_decode_into(chunk_subset, output_view, options)?; Ok(()) } } diff --git a/zarrs/src/array/array_sync_readable_writable.rs b/zarrs/src/array/array_sync_readable_writable.rs index 520cb933..478f93a2 100644 --- a/zarrs/src/array/array_sync_readable_writable.rs +++ b/zarrs/src/array/array_sync_readable_writable.rs @@ -200,15 +200,13 @@ impl Array chunk_bytes_old.validate(chunk_shape.iter().product(), self.data_type().size())?; // Update the chunk - let chunk_bytes_new = unsafe { - update_array_bytes( - chunk_bytes_old, - &chunk_shape, - chunk_subset, - &chunk_subset_bytes, - self.data_type().size(), - ) - }; + let chunk_bytes_new = update_array_bytes( + chunk_bytes_old, + &chunk_shape, + chunk_subset, + &chunk_subset_bytes, + self.data_type().size(), + )?; // Store the updated chunk self.store_chunk_opt(chunk_indices, chunk_bytes_new, options) diff --git a/zarrs/src/array/array_sync_sharded_readable_ext.rs b/zarrs/src/array/array_sync_sharded_readable_ext.rs index 461df909..04ef4d4e 100644 --- a/zarrs/src/array/array_sync_sharded_readable_ext.rs +++ b/zarrs/src/array/array_sync_sharded_readable_ext.rs @@ -7,7 +7,7 @@ use zarrs_metadata::v3::array::codec::sharding::ShardingCodecConfiguration; use zarrs_storage::byte_range::ByteRange; use zarrs_storage::StorageHandle; -use super::array_bytes::{merge_chunks_vlen, update_bytes_flen}; +use super::array_bytes::merge_chunks_vlen; use super::codec::array_to_bytes::sharding::ShardingPartialDecoder; use super::codec::{CodecError, ShardingCodec}; use super::element::ElementOwned; @@ -15,7 +15,7 @@ use super::{ codec::CodecOptions, concurrency::concurrency_chunks_and_codec, Array, ArrayError, ArrayShardedExt, ChunkGrid, }; -use super::{ArrayBytes, ArraySize, DataTypeSize}; +use super::{ArrayBytes, ArrayBytesFixedDisjointView, ArraySize, DataTypeSize}; use crate::array::codec::StoragePartialDecoder; use crate::storage::ReadableStorageTraits; use crate::{array::codec::ArrayPartialDecoderTraits, array_subset::ArraySubset}; 
@@ -612,13 +612,18 @@ impl ArrayShardedReadableExt )? .remove(0) .into_owned(); - update_bytes_flen( - &output, - array_subset.shape(), - &bytes.into_fixed()?, - &shard_subset_overlap.relative_to(array_subset.start())?, - data_type_size, - ); + let mut output_view = unsafe { + // SAFETY: chunks represent disjoint array subsets + ArrayBytesFixedDisjointView::new_unchecked( + output, + data_type_size, + array_subset.shape(), + shard_subset_overlap.relative_to(array_subset.start())?, + ) + }; + output_view + .copy_from_slice(&bytes.into_fixed()?) + .map_err(CodecError::from)?; Ok::<_, ArrayError>(()) }; let indices = shards.indices(); diff --git a/zarrs/src/array/chunk_cache/array_chunk_cache_ext_sync.rs b/zarrs/src/array/chunk_cache/array_chunk_cache_ext_sync.rs index 03cd7036..71e4a8fa 100644 --- a/zarrs/src/array/chunk_cache/array_chunk_cache_ext_sync.rs +++ b/zarrs/src/array/chunk_cache/array_chunk_cache_ext_sync.rs @@ -6,10 +6,11 @@ use unsafe_cell_slice::UnsafeCellSlice; use crate::{ array::{ - array_bytes::{merge_chunks_vlen, update_bytes_flen}, - codec::CodecOptions, + array_bytes::merge_chunks_vlen, + codec::{CodecError, CodecOptions}, concurrency::concurrency_chunks_and_codec, - Array, ArrayBytes, ArrayError, ArraySize, DataTypeSize, ElementOwned, + Array, ArrayBytes, ArrayBytesFixedDisjointView, ArrayError, ArraySize, DataTypeSize, + ElementOwned, }, array_subset::ArraySubset, storage::ReadableStorageTraits, @@ -408,13 +409,18 @@ impl ArrayChunkCacheExt unreachable!(), }; - update_bytes_flen( - &output, - array_subset.shape(), - fixed, - &chunk_subset_overlap.relative_to(array_subset.start())?, - data_type_size, - ); + let mut output_view = unsafe { + // SAFETY: chunks represent disjoint array subsets + ArrayBytesFixedDisjointView::new_unchecked( + output, + data_type_size, + array_subset.shape(), + chunk_subset_overlap.relative_to(array_subset.start())?, + ) + }; + output_view + .copy_from_slice(fixed) + .map_err(CodecError::from)?; Ok::<_, ArrayError>(()) 
}; iter_concurrent_limit!( diff --git a/zarrs/src/array/chunk_grid.rs b/zarrs/src/array/chunk_grid.rs index 79ff9ee7..9f5c3ea8 100644 --- a/zarrs/src/array/chunk_grid.rs +++ b/zarrs/src/array/chunk_grid.rs @@ -127,6 +127,9 @@ impl TryFrom for ChunkGrid { } /// Chunk grid traits. +// TODO: Unsafe trait? ChunkGridTraits has invariants that must be upheld by implementations. +// - chunks must be disjoint for downstream `ArrayBytesFixedDisjoint` construction and otherwise sane behavior +// - this is true for regular and rectangular grids, but a custom grid could violate this pub trait ChunkGridTraits: core::fmt::Debug + Send + Sync { /// Create metadata. fn create_metadata(&self) -> MetadataV3; diff --git a/zarrs/src/array/codec.rs b/zarrs/src/array/codec.rs index 1eff5539..a32de1b3 100644 --- a/zarrs/src/array/codec.rs +++ b/zarrs/src/array/codec.rs @@ -70,7 +70,6 @@ pub use byte_interval_partial_decoder::ByteIntervalPartialDecoder; #[cfg(feature = "async")] pub use byte_interval_partial_decoder::AsyncByteIntervalPartialDecoder; -use unsafe_cell_slice::UnsafeCellSlice; mod array_partial_encoder_default; pub use array_partial_encoder_default::ArrayPartialEncoderDefault; @@ -98,12 +97,11 @@ use std::any::Any; use std::borrow::Cow; use std::sync::Arc; -use super::array_bytes::update_bytes_flen; use super::{ concurrency::RecommendedConcurrency, BytesRepresentation, ChunkRepresentation, ChunkShape, DataType, }; -use super::{ArrayBytes, RawBytes}; +use super::{ArrayBytes, ArrayBytesFixedDisjointView, RawBytes}; /// A codec plugin. pub type CodecPlugin = Plugin; @@ -363,35 +361,26 @@ pub trait ArrayPartialDecoderTraits: Any + Send + Sync { /// Extracted elements from the `array_subset` are written to the subset of the output in C order. /// /// # Errors - /// Returns [`CodecError`] if a codec fails or an array subset is invalid. 
- /// - /// # Safety - /// The caller must ensure that: - /// - `output` holds enough space for the preallocated bytes of an array with shape `output_shape` of the appropriate data type, - /// - `output_subset` is within the bounds of `output_shape`, - /// - `output_subset` has the same number of elements as `array_subset`, and - /// - `output_subset`s must be non-overlapping when called in parallel on the same `output`. - unsafe fn partial_decode_into( + /// Returns [`CodecError`] if a codec fails or the number of elements in `array_subset` does not match the number of elements in `output_view`, + fn partial_decode_into( &self, array_subset: &ArraySubset, - output: &UnsafeCellSlice, - output_shape: &[u64], - output_subset: &ArraySubset, + output_view: &mut ArrayBytesFixedDisjointView<'_>, options: &CodecOptions, ) -> Result<(), CodecError> { - debug_assert!(output_subset.inbounds_shape(output_shape)); - debug_assert_eq!(array_subset.num_elements(), output_subset.num_elements()); + if array_subset.num_elements() != output_view.num_elements() { + return Err(InvalidNumberOfElementsError::new( + array_subset.num_elements(), + output_view.num_elements(), + ) + .into()); + } + let decoded_value = self .partial_decode(&[array_subset.clone()], options)? .remove(0); if let ArrayBytes::Fixed(decoded_value) = decoded_value { - update_bytes_flen( - output, - output_shape, - &decoded_value, - output_subset, - self.data_type().fixed_size().unwrap(), - ); + output_view.copy_from_slice(&decoded_value)?; Ok(()) } else { Err(CodecError::ExpectedFixedLengthBytes) @@ -456,28 +445,25 @@ pub trait AsyncArrayPartialDecoderTraits: Any + Send + Sync { /// Async variant of [`ArrayPartialDecoderTraits::partial_decode_into`]. 
#[allow(clippy::missing_safety_doc)] - async unsafe fn partial_decode_into( + async fn partial_decode_into( &self, array_subset: &ArraySubset, - output: &UnsafeCellSlice, - output_shape: &[u64], - output_subset: &ArraySubset, + output_view: &mut ArrayBytesFixedDisjointView<'_>, options: &CodecOptions, ) -> Result<(), CodecError> { - debug_assert!(output_subset.inbounds_shape(output_shape)); - debug_assert_eq!(array_subset.shape(), output_subset.shape()); + if array_subset.num_elements() != output_view.num_elements() { + return Err(InvalidNumberOfElementsError::new( + output_view.num_elements(), + array_subset.num_elements(), + ) + .into()); + } let decoded_value = self .partial_decode(&[array_subset.clone()], options) .await? .remove(0); if let ArrayBytes::Fixed(decoded_value) = decoded_value { - update_bytes_flen( - output, - output_shape, - &decoded_value, - output_subset, - self.data_type().fixed_size().unwrap(), - ); + output_view.copy_from_slice(&decoded_value)?; Ok(()) } else { Err(CodecError::ExpectedFixedLengthBytes) @@ -715,37 +701,24 @@ pub trait ArrayToBytesCodecTraits: ArrayCodecTraits + core::fmt::Debug { /// Chunk elements are written to the subset of the output in C order. /// /// # Errors - /// Returns [`CodecError`] if a codec fails or the decoded output is incompatible with `decoded_representation`. - /// - /// # Safety - /// The caller must ensure that: - /// - `output` holds enough space for the preallocated bytes of an array with shape `output_shape` of the appropriate data type, and - /// - `output_subset` is within the bounds of `output_shape`, - /// - `output_subset` has the same number of elements as the decoded representation shape, and - /// - `output_subset`s must be non-overlapping when called in parallel on the same `output`. 
- unsafe fn decode_into( + /// Returns [`CodecError`] if a codec fails or the number of elements in `decoded_representation` does not match the number of elements in `output_view`, + fn decode_into( &self, bytes: RawBytes<'_>, decoded_representation: &ChunkRepresentation, - output: &UnsafeCellSlice, - output_shape: &[u64], - output_subset: &ArraySubset, + output_view: &mut ArrayBytesFixedDisjointView<'_>, options: &CodecOptions, ) -> Result<(), CodecError> { - debug_assert!(output_subset.inbounds_shape(output_shape)); - debug_assert_eq!( - decoded_representation.num_elements(), - output_subset.num_elements() - ); + if decoded_representation.num_elements() != output_view.num_elements() { + return Err(InvalidNumberOfElementsError::new( + output_view.num_elements(), + decoded_representation.num_elements(), + ) + .into()); + } let decoded_value = self.decode(bytes, decoded_representation, options)?; if let ArrayBytes::Fixed(decoded_value) = decoded_value { - update_bytes_flen( - output, - output_shape, - &decoded_value, - output_subset, - decoded_representation.data_type().fixed_size().unwrap(), - ); + output_view.copy_from_slice(&decoded_value)?; } else { return Err(CodecError::ExpectedFixedLengthBytes); } diff --git a/zarrs/src/array/codec/array_partial_encoder_default.rs b/zarrs/src/array/codec/array_partial_encoder_default.rs index 407c07ab..98b78b20 100644 --- a/zarrs/src/array/codec/array_partial_encoder_default.rs +++ b/zarrs/src/array/codec/array_partial_encoder_default.rs @@ -76,15 +76,13 @@ impl ArrayPartialEncoderTraits for ArrayPartialEncoderDefault { self.decoded_representation.data_type().size(), )?; - chunk_bytes = unsafe { - update_array_bytes( - chunk_bytes, - &chunk_shape, - chunk_subset, - chunk_subset_bytes, - self.decoded_representation.data_type().size(), - ) - }; + chunk_bytes = update_array_bytes( + chunk_bytes, + &chunk_shape, + chunk_subset, + chunk_subset_bytes, + self.decoded_representation.data_type().size(), + )?; } let is_fill_value = 
!options.store_empty_chunks() diff --git a/zarrs/src/array/codec/array_to_array_partial_encoder_default.rs b/zarrs/src/array/codec/array_to_array_partial_encoder_default.rs index 19e76fc5..6ded2b0b 100644 --- a/zarrs/src/array/codec/array_to_array_partial_encoder_default.rs +++ b/zarrs/src/array/codec/array_to_array_partial_encoder_default.rs @@ -86,15 +86,13 @@ impl ArrayPartialEncoderTraits for ArrayToArrayPartialEncoderDefault { self.decoded_representation.data_type().size(), )?; - decoded_value = unsafe { - update_array_bytes( - decoded_value, - &chunk_shape, - chunk_subset, - chunk_subset_bytes, - self.decoded_representation.data_type().size(), - ) - }; + decoded_value = update_array_bytes( + decoded_value, + &chunk_shape, + chunk_subset, + chunk_subset_bytes, + self.decoded_representation.data_type().size(), + )?; } let is_fill_value = !options.store_empty_chunks() diff --git a/zarrs/src/array/codec/array_to_bytes/codec_chain.rs b/zarrs/src/array/codec/array_to_bytes/codec_chain.rs index 804b683f..8fc9bd95 100644 --- a/zarrs/src/array/codec/array_to_bytes/codec_chain.rs +++ b/zarrs/src/array/codec/array_to_bytes/codec_chain.rs @@ -2,11 +2,8 @@ use std::sync::Arc; -use unsafe_cell_slice::UnsafeCellSlice; - use crate::{ array::{ - array_bytes::update_bytes_flen, codec::{ ArrayCodecTraits, ArrayPartialDecoderCache, ArrayPartialDecoderTraits, ArrayPartialEncoderTraits, ArrayToArrayCodecTraits, ArrayToBytesCodecTraits, @@ -15,9 +12,9 @@ use crate::{ CodecTraits, }, concurrency::RecommendedConcurrency, - ArrayBytes, BytesRepresentation, ChunkRepresentation, ChunkShape, RawBytes, + ArrayBytes, ArrayBytesFixedDisjointView, BytesRepresentation, ChunkRepresentation, + ChunkShape, RawBytes, }, - array_subset::ArraySubset, metadata::v3::MetadataV3, plugin::PluginCreateError, }; @@ -309,13 +306,11 @@ impl ArrayToBytesCodecTraits for CodecChain { Ok(bytes) } - unsafe fn decode_into( + fn decode_into( &self, mut bytes: RawBytes<'_>, decoded_representation: 
&ChunkRepresentation, - output: &UnsafeCellSlice, - output_shape: &[u64], - output_subset: &ArraySubset, + output_view: &mut ArrayBytesFixedDisjointView<'_>, options: &CodecOptions, ) -> Result<(), CodecError> { let array_representations = @@ -325,16 +320,12 @@ impl ArrayToBytesCodecTraits for CodecChain { if self.bytes_to_bytes.is_empty() && self.array_to_array.is_empty() { // Fast path if no bytes to bytes or array to array codecs - return unsafe { - self.array_to_bytes.decode_into( - bytes, - array_representations.last().unwrap(), - output, - output_shape, - output_subset, - options, - ) - }; + return self.array_to_bytes.decode_into( + bytes, + array_representations.last().unwrap(), + output_view, + options, + ); } // bytes->bytes @@ -347,16 +338,12 @@ impl ArrayToBytesCodecTraits for CodecChain { if self.array_to_array.is_empty() { // Fast path if no array to array codecs - return unsafe { - self.array_to_bytes.decode_into( - bytes, - array_representations.last().unwrap(), - output, - output_shape, - output_subset, - options, - ) - }; + return self.array_to_bytes.decode_into( + bytes, + array_representations.last().unwrap(), + output_view, + options, + ); } // bytes->array @@ -377,13 +364,7 @@ impl ArrayToBytesCodecTraits for CodecChain { )?; if let ArrayBytes::Fixed(decoded_value) = bytes { - update_bytes_flen( - output, - output_shape, - &decoded_value, - output_subset, - decoded_representation.data_type().fixed_size().unwrap(), - ); + output_view.copy_from_slice(&decoded_value)?; } else { // TODO: Variable length data type support? 
return Err(CodecError::ExpectedFixedLengthBytes); diff --git a/zarrs/src/array/codec/array_to_bytes/sharding/sharding_codec.rs b/zarrs/src/array/codec/array_to_bytes/sharding/sharding_codec.rs index f432cc02..ccf76f54 100644 --- a/zarrs/src/array/codec/array_to_bytes/sharding/sharding_codec.rs +++ b/zarrs/src/array/codec/array_to_bytes/sharding/sharding_codec.rs @@ -6,7 +6,7 @@ use std::{ use crate::{ array::{ - array_bytes::{merge_chunks_vlen, update_bytes_flen}, + array_bytes::merge_chunks_vlen, chunk_shape_to_array_shape, codec::{ ArrayCodecTraits, ArrayPartialDecoderTraits, ArrayPartialEncoderTraits, @@ -15,8 +15,8 @@ use crate::{ RecommendedConcurrency, }, concurrency::calc_concurrency_outer_inner, - transmute_to_bytes_vec, unravel_index, ArrayBytes, ArraySize, BytesRepresentation, - ChunkRepresentation, ChunkShape, DataTypeSize, FillValue, RawBytes, + transmute_to_bytes_vec, unravel_index, ArrayBytes, ArrayBytesFixedDisjointView, ArraySize, + BytesRepresentation, ChunkRepresentation, ChunkShape, DataTypeSize, RawBytes, }, array_subset::ArraySubset, metadata::v3::MetadataV3, @@ -105,21 +105,6 @@ impl CodecTraits for ShardingCodec { } } -/// Repeat the fill value into a contiguous vec -/// The length is the contiguous elements of an inner chunk in the shard. See `ContiguousLinearisedIndices`. 
-fn get_contiguous_fill_value( - fill_value: &FillValue, - chunk_shape: &[NonZeroU64], - shard_shape: &[u64], -) -> Vec { - let chunk_subset = ArraySubset::new_with_shape(chunk_shape_to_array_shape(chunk_shape)); - let contiguous_iterator = - unsafe { chunk_subset.contiguous_linearised_indices_unchecked(shard_shape) }; - fill_value - .as_ne_bytes() - .repeat(contiguous_iterator.contiguous_elements_usize()) -} - impl ArrayCodecTraits for ShardingCodec { fn recommended_concurrency( &self, @@ -183,7 +168,6 @@ impl ArrayToBytesCodecTraits for ShardingCodec { shard_representation: &ChunkRepresentation, options: &CodecOptions, ) -> Result, CodecError> { - let shard_shape = shard_representation.shape_u64(); let chunk_representation = unsafe { ChunkRepresentation::new_unchecked( self.chunk_shape.as_slice().to_vec(), @@ -202,10 +186,6 @@ impl ArrayToBytesCodecTraits for ShardingCodec { let shard_index = self.decode_index(&encoded_shard, chunks_per_shard.as_slice(), options)?; - let any_empty = shard_index - .par_iter() - .any(|offset_or_size| *offset_or_size == u64::MAX); - // Calc self/internal concurrent limits let (shard_concurrent_limit, concurrency_limit_inner_chunks) = calc_concurrency_outer_inner( options.concurrent_target(), @@ -272,49 +252,29 @@ impl ArrayToBytesCodecTraits for ShardingCodec { } let mut decoded_shard = Vec::::with_capacity(size_output); - let contiguous_fill_value = if any_empty { - Some(get_contiguous_fill_value( - shard_representation.fill_value(), - &self.chunk_shape, - &shard_shape, - )) - } else { - None - }; - { let output = UnsafeCellSlice::new_from_vec_with_spare_capacity(&mut decoded_shard); + let shard_shape = shard_representation.shape_u64(); let decode_chunk = |chunk_index: usize| { let chunk_subset = self .chunk_index_to_subset(chunk_index as u64, chunks_per_shard.as_slice()); + let mut output_view_inner_chunk = unsafe { + // SAFETY: chunks represent disjoint array subsets + ArrayBytesFixedDisjointView::new_unchecked( + output, + 
data_type_size, + &shard_shape, + chunk_subset, + ) + }; // Read the offset/size let offset = shard_index[chunk_index * 2]; let size = shard_index[chunk_index * 2 + 1]; if offset == u64::MAX && size == u64::MAX { - if let Some(fv) = &contiguous_fill_value { - let contiguous_iterator = unsafe { - chunk_subset - .contiguous_linearised_indices_unchecked(&shard_shape) - }; - let elements = contiguous_iterator.contiguous_elements(); - for index in &contiguous_iterator { - debug_assert_eq!( - fv.len() as u64, - elements * data_type_size as u64 - ); - let shard_offset = - usize::try_from(index * data_type_size as u64).unwrap(); - unsafe { - output - .index_mut(shard_offset..shard_offset + fv.len()) - .copy_from_slice(fv); - } - } - } else { - unreachable!(); - } + output_view_inner_chunk + .fill(shard_representation.fill_value().as_ne_bytes())?; } else if usize::try_from(offset + size).unwrap() > encoded_shard.len() { return Err(CodecError::Other( "The shard index references out-of-bounds bytes. The chunk may be corrupted." @@ -329,13 +289,9 @@ impl ArrayToBytesCodecTraits for ShardingCodec { &chunk_representation, &options, )?; - update_bytes_flen( - &output, - &shard_representation.shape_u64(), - &decoded_chunk.into_fixed()?, - &chunk_subset, - data_type_size, - ); + output_view_inner_chunk + .copy_from_slice(&decoded_chunk.into_fixed()?) 
+ .map_err(CodecError::from)?; }; Ok::<_, CodecError>(()) @@ -355,16 +311,13 @@ impl ArrayToBytesCodecTraits for ShardingCodec { } #[allow(clippy::too_many_lines)] - unsafe fn decode_into( + fn decode_into( &self, encoded_shard: RawBytes<'_>, shard_representation: &ChunkRepresentation, - output: &UnsafeCellSlice, - output_shape: &[u64], - output_subset: &ArraySubset, + output_view: &mut ArrayBytesFixedDisjointView<'_>, options: &CodecOptions, ) -> Result<(), CodecError> { - let shard_shape = shard_representation.shape_u64(); let chunk_representation = unsafe { ChunkRepresentation::new_unchecked( self.chunk_shape.as_slice().to_vec(), @@ -383,10 +336,6 @@ impl ArrayToBytesCodecTraits for ShardingCodec { let shard_index = self.decode_index(&encoded_shard, chunks_per_shard.as_slice(), options)?; - let any_empty = shard_index - .par_iter() - .any(|offset_or_size| *offset_or_size == u64::MAX); - // Calc self/internal concurrent limits let (shard_concurrent_limit, concurrency_limit_inner_chunks) = calc_concurrency_outer_inner( options.concurrent_target(), @@ -400,96 +349,55 @@ impl ArrayToBytesCodecTraits for ShardingCodec { .concurrent_target(concurrency_limit_inner_chunks) .build(); - match shard_representation.data_type().size() { - DataTypeSize::Variable => { - // TODO: Variable length data type support? 
- Err(CodecError::ExpectedFixedLengthBytes) - } - DataTypeSize::Fixed(data_type_size) => { - let contiguous_fill_value = if any_empty { - Some(get_contiguous_fill_value( - shard_representation.fill_value(), - &self.chunk_shape, - &shard_shape, - )) - } else { - None - }; - - { - let decode_chunk = |chunk_index: usize| { - let chunk_subset = self - .chunk_index_to_subset(chunk_index as u64, chunks_per_shard.as_slice()); + let decode_chunk = |chunk_index: usize| { + let chunk_subset = + self.chunk_index_to_subset(chunk_index as u64, chunks_per_shard.as_slice()); - let output_subset_chunk = ArraySubset::new_with_start_shape( - std::iter::zip(output_subset.start(), chunk_subset.start()) - .map(|(o, s)| o + s) - .collect(), - chunk_subset.shape().to_vec(), - ) - .unwrap(); + let output_subset_chunk = ArraySubset::new_with_start_shape( + std::iter::zip(output_view.subset().start(), chunk_subset.start()) + .map(|(o, s)| o + s) + .collect(), + chunk_subset.shape().to_vec(), + ) + .unwrap(); + let mut output_view_inner_chunk = unsafe { + // SAFETY: inner chunks represent disjoint array subsets + output_view.subdivide_unchecked(output_subset_chunk) + }; - // Read the offset/size - let offset = shard_index[chunk_index * 2]; - let size = shard_index[chunk_index * 2 + 1]; - if offset == u64::MAX && size == u64::MAX { - if let Some(fv) = &contiguous_fill_value { - let contiguous_iterator = unsafe { - output_subset_chunk - .contiguous_linearised_indices_unchecked(output_shape) - }; - let elements = contiguous_iterator.contiguous_elements(); - for index in &contiguous_iterator { - debug_assert_eq!( - fv.len() as u64, - elements * data_type_size as u64 - ); - let shard_offset = - usize::try_from(index * data_type_size as u64).unwrap(); - unsafe { - output - .index_mut(shard_offset..shard_offset + fv.len()) - .copy_from_slice(fv); - } - } - } else { - unreachable!(); - } - } else if usize::try_from(offset + size).unwrap() > encoded_shard.len() { - return Err(CodecError::Other( - 
"The shard index references out-of-bounds bytes. The chunk may be corrupted." - .to_string(), - )); - } else { - let offset: usize = offset.try_into().unwrap(); - let size: usize = size.try_into().unwrap(); - let encoded_chunk = &encoded_shard[offset..offset + size]; - unsafe { - self.inner_codecs.decode_into( - Cow::Borrowed(encoded_chunk), - &chunk_representation, - output, - output_shape, - &output_subset_chunk, - &options, - )?; - } - }; + // Read the offset/size + let offset = shard_index[chunk_index * 2]; + let size = shard_index[chunk_index * 2 + 1]; + if offset == u64::MAX && size == u64::MAX { + output_view_inner_chunk.fill(shard_representation.fill_value().as_ne_bytes())?; + } else if usize::try_from(offset + size).unwrap() > encoded_shard.len() { + return Err(CodecError::Other( + "The shard index references out-of-bounds bytes. The chunk may be corrupted." + .to_string(), + )); + } else { + let offset: usize = offset.try_into().unwrap(); + let size: usize = size.try_into().unwrap(); + let encoded_chunk = &encoded_shard[offset..offset + size]; + self.inner_codecs.decode_into( + Cow::Borrowed(encoded_chunk), + &chunk_representation, + &mut output_view_inner_chunk, + &options, + )?; + }; - Ok::<_, CodecError>(()) - }; + Ok::<_, CodecError>(()) + }; - rayon_iter_concurrent_limit::iter_concurrent_limit!( - shard_concurrent_limit, - (0..num_chunks), - try_for_each, - decode_chunk - )?; + rayon_iter_concurrent_limit::iter_concurrent_limit!( + shard_concurrent_limit, + (0..num_chunks), + try_for_each, + decode_chunk + )?; - Ok(()) - } - } - } + Ok(()) } fn partial_decoder( diff --git a/zarrs/src/array/codec/array_to_bytes/sharding/sharding_partial_decoder.rs b/zarrs/src/array/codec/array_to_bytes/sharding/sharding_partial_decoder.rs index bc90361e..303b70b0 100644 --- a/zarrs/src/array/codec/array_to_bytes/sharding/sharding_partial_decoder.rs +++ b/zarrs/src/array/codec/array_to_bytes/sharding/sharding_partial_decoder.rs @@ -5,15 +5,15 @@ use 
unsafe_cell_slice::UnsafeCellSlice; use zarrs_storage::byte_range::ByteRange; use crate::array::{ - array_bytes::{merge_chunks_vlen, update_bytes_flen}, + array_bytes::merge_chunks_vlen, codec::{ ArrayCodecTraits, ArrayPartialDecoderTraits, ArraySubset, ArrayToBytesCodecTraits, ByteIntervalPartialDecoder, BytesPartialDecoderTraits, CodecChain, CodecError, CodecOptions, }, concurrency::{calc_concurrency_outer_inner, RecommendedConcurrency}, - ravel_indices, ArrayBytes, ArraySize, ChunkRepresentation, ChunkShape, DataType, DataTypeSize, - RawBytes, + ravel_indices, ArrayBytes, ArrayBytesFixedDisjointView, ArraySize, ChunkRepresentation, + ChunkShape, DataType, DataTypeSize, RawBytes, }; #[cfg(feature = "async")] @@ -305,16 +305,20 @@ impl ArrayPartialDecoderTraits for ShardingPartialDecoder { .into_owned() }; let decoded_bytes = decoded_bytes.into_fixed()?; - update_bytes_flen( - &out_array_subset_slice, - array_subset.shape(), - &decoded_bytes, - &chunk_subset_overlap - .relative_to(array_subset.start()) - .unwrap(), - data_type_size, - ); - Ok::<_, CodecError>(()) + let mut output_view = unsafe { + // SAFETY: chunks represent disjoint array subsets + ArrayBytesFixedDisjointView::new_unchecked( + out_array_subset_slice, + data_type_size, + array_subset.shape(), + chunk_subset_overlap + .relative_to(array_subset.start()) + .unwrap(), + ) + }; + output_view + .copy_from_slice(&decoded_bytes) + .map_err(CodecError::from) }; rayon_iter_concurrent_limit::iter_concurrent_limit!( @@ -597,15 +601,20 @@ impl AsyncArrayPartialDecoderTraits for AsyncShardingPartialDecoder { Vec, ArraySubset, ) = subset_and_decoded_chunk?; - update_bytes_flen( - &shard_slice, - array_subset.shape(), - &chunk_subset_bytes.into(), - &chunk_subset_overlap - .relative_to(array_subset.start()) - .unwrap(), - data_type_size, - ); + let mut output_view = unsafe { + // SAFETY: chunks represent disjoint array subsets + ArrayBytesFixedDisjointView::new_unchecked( + shard_slice, + data_type_size, + 
array_subset.shape(), + chunk_subset_overlap + .relative_to(array_subset.start()) + .unwrap(), + ) + }; + output_view + .copy_from_slice(&chunk_subset_bytes) + .expect("chunk subset bytes are the correct length"); Ok::<_, CodecError>(()) } )?; @@ -627,26 +636,26 @@ impl AsyncArrayPartialDecoderTraits for AsyncShardingPartialDecoder { rayon_iter_concurrent_limit::iter_concurrent_limit!( options.concurrent_target(), filled_chunks, - for_each, + try_for_each, |chunk_subset: &ArraySubset| { let chunk_subset_overlap = unsafe { array_subset.overlap_unchecked(chunk_subset) }; - let filled_chunk = self - .decoded_representation - .fill_value() - .as_ne_bytes() - .repeat(chunk_subset_overlap.num_elements_usize()); - update_bytes_flen( - &shard_slice, - array_subset.shape(), - &filled_chunk.into(), - &chunk_subset_overlap - .relative_to(array_subset.start()) - .unwrap(), - data_type_size, - ); + let mut output_view = unsafe { + // SAFETY: chunks represent disjoint array subsets + ArrayBytesFixedDisjointView::new_unchecked( + shard_slice, + data_type_size, + array_subset.shape(), + chunk_subset_overlap + .relative_to(array_subset.start()) + .unwrap(), + ) + }; + output_view + .fill(self.decoded_representation.fill_value().as_ne_bytes()) + .map_err(CodecError::from) } - ); + )?; }; unsafe { shard.set_len(shard_size) }; out.push(ArrayBytes::from(shard)); diff --git a/zarrs/src/array/codec/array_to_bytes/sharding/sharding_partial_encoder.rs b/zarrs/src/array/codec/array_to_bytes/sharding/sharding_partial_encoder.rs index 1dc04e3c..b20b7263 100644 --- a/zarrs/src/array/codec/array_to_bytes/sharding/sharding_partial_encoder.rs +++ b/zarrs/src/array/codec/array_to_bytes/sharding/sharding_partial_encoder.rs @@ -303,17 +303,15 @@ impl ArrayPartialEncoderTraits for ShardingPartialEncoder { }; // Update the inner chunk - let inner_chunk_updated = unsafe { - update_array_bytes( - inner_chunk_decoded, - &self.inner_chunk_representation.shape_u64(), - &inner_chunk_subset_overlap - 
.relative_to(inner_chunk_subset.start()) - .unwrap(), - &inner_chunk_bytes, - self.inner_chunk_representation.data_type().size(), - ) - }; + let inner_chunk_updated = update_array_bytes( + inner_chunk_decoded, + &self.inner_chunk_representation.shape_u64(), + &inner_chunk_subset_overlap + .relative_to(inner_chunk_subset.start()) + .unwrap(), + &inner_chunk_bytes, + self.inner_chunk_representation.data_type().size(), + )?; inner_chunks_decoded .lock() .unwrap() From 177f701c5905eb16791ebd01cbcdb3fde9711f1a Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Thu, 23 Jan 2025 08:26:24 +1100 Subject: [PATCH 23/45] refactor!: change `RawBytesOffsets` into a validated newtype (#137) Closes #132 --- CHANGELOG.md | 1 + zarrs/src/array.rs | 2 +- zarrs/src/array/array_bytes.rs | 47 +++++--- .../array/array_bytes/raw_bytes_offsets.rs | 108 ++++++++++++++++++ zarrs/src/array/codec.rs | 9 +- .../array/codec/array_to_array/transpose.rs | 4 + .../codec/array_to_bytes/vlen/vlen_codec.rs | 8 +- .../array_to_bytes/vlen_v2/vlen_v2_codec.rs | 2 + zarrs/src/array/element.rs | 12 +- 9 files changed, 167 insertions(+), 26 deletions(-) create mode 100644 zarrs/src/array/array_bytes/raw_bytes_offsets.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 189c092b..606b7eef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `ArrayToBytesCodecTraits::decode_into` - `zarrs::array::copy_fill_value_into` - `zarrs::array::update_array_bytes` +- **Breaking**: change `RawBytesOffsets` into a validated newtype ## [0.19.1] - 2025-01-19 diff --git a/zarrs/src/array.rs b/zarrs/src/array.rs index 7f1800e1..1f7b17f6 100644 --- a/zarrs/src/array.rs +++ b/zarrs/src/array.rs @@ -49,7 +49,7 @@ pub use self::{ array_builder::ArrayBuilder, array_bytes::{ copy_fill_value_into, update_array_bytes, ArrayBytes, ArrayBytesError, RawBytes, - RawBytesOffsets, + RawBytesOffsets, RawBytesOffsetsCreateError, }, 
array_bytes_fixed_disjoint_view::{ ArrayBytesFixedDisjointView, ArrayBytesFixedDisjointViewCreateError, diff --git a/zarrs/src/array/array_bytes.rs b/zarrs/src/array/array_bytes.rs index 023e13f4..498d66b2 100644 --- a/zarrs/src/array/array_bytes.rs +++ b/zarrs/src/array/array_bytes.rs @@ -15,6 +15,9 @@ use super::{ ravel_indices, ArrayBytesFixedDisjointView, ArraySize, DataType, FillValue, }; +mod raw_bytes_offsets; +pub use raw_bytes_offsets::{RawBytesOffsets, RawBytesOffsetsCreateError}; + /// Array element bytes. /// /// These can represent: @@ -23,11 +26,6 @@ use super::{ /// - Encoded array bytes after an array to bytes or bytes to bytes codecs. pub type RawBytes<'a> = Cow<'a, [u8]>; -/// Array element byte offsets. -/// -/// These must be monotonically increasing. See [`ArrayBytes::Variable`]. -pub type RawBytesOffsets<'a> = Cow<'a, [usize]>; // FIXME: Switch to a validated newtype in zarrs 0.20 - /// Fixed or variable length array bytes. #[derive(Clone, Debug, PartialEq, Eq)] pub enum ArrayBytes<'a> { @@ -60,15 +58,10 @@ impl<'a> ArrayBytes<'a> { } /// Create a new variable length array bytes from `bytes` and `offsets`. - pub fn new_vlen( - bytes: impl Into>, - offsets: impl Into>, // FIXME: TryInto - ) -> Self { - Self::Variable(bytes.into(), offsets.into()) + pub fn new_vlen(bytes: impl Into>, offsets: RawBytesOffsets<'a>) -> Self { + Self::Variable(bytes.into(), offsets) } - // TODO: new_vlen_unchecked - /// Create a new [`ArrayBytes`] with `num_elements` composed entirely of the `fill_value`. /// /// # Panics @@ -85,12 +78,14 @@ impl<'a> ArrayBytes<'a> { } ArraySize::Variable { num_elements } => { let num_elements = usize::try_from(num_elements).unwrap(); - Self::new_vlen( - fill_value.as_ne_bytes().repeat(num_elements), - (0..=num_elements) - .map(|i| i * fill_value.size()) - .collect::>(), - ) + Self::new_vlen(fill_value.as_ne_bytes().repeat(num_elements), unsafe { + // SAFETY: The offsets are monotonically increasing. 
+ RawBytesOffsets::new_unchecked( + (0..=num_elements) + .map(|i| i * fill_value.size()) + .collect::>(), + ) + }) } } } @@ -207,6 +202,10 @@ impl<'a> ArrayBytes<'a> { ss_bytes.extend_from_slice(&bytes[curr..next]); } ss_offsets.push(ss_bytes.len()); + let ss_offsets = unsafe { + // SAFETY: The offsets are monotonically increasing. + RawBytesOffsets::new_unchecked(ss_offsets) + }; Ok(ArrayBytes::new_vlen(ss_bytes, ss_offsets)) } ArrayBytes::Fixed(bytes) => { @@ -334,6 +333,10 @@ pub(crate) fn update_bytes_vlen<'a>( } } offsets_new.push(bytes_new.len()); + let offsets_new = unsafe { + // SAFETY: The offsets are monotonically increasing. + RawBytesOffsets::new_unchecked(offsets_new) + }; Ok(ArrayBytes::new_vlen(bytes_new, offsets_new)) } @@ -438,6 +441,10 @@ pub(crate) fn merge_chunks_vlen<'a>( *acc += sz; Some(*acc) })); + let offsets = unsafe { + // SAFETY: The offsets are monotonically increasing. + RawBytesOffsets::new_unchecked(offsets) + }; // Write bytes // TODO: Go parallel @@ -485,6 +492,10 @@ pub(crate) fn extract_decoded_regions_vlen<'a>( region_bytes.extend_from_slice(&bytes[curr..next]); } region_offsets.push(region_bytes.len()); + let region_offsets = unsafe { + // SAFETY: The offsets are monotonically increasing. + RawBytesOffsets::new_unchecked(region_offsets) + }; out.push(ArrayBytes::new_vlen(region_bytes, region_offsets)); } Ok(out) diff --git a/zarrs/src/array/array_bytes/raw_bytes_offsets.rs b/zarrs/src/array/array_bytes/raw_bytes_offsets.rs new file mode 100644 index 00000000..4b335014 --- /dev/null +++ b/zarrs/src/array/array_bytes/raw_bytes_offsets.rs @@ -0,0 +1,108 @@ +use std::{borrow::Cow, ops::Deref}; + +use derive_more::derive::Display; +use thiserror::Error; + +/// Array element byte offsets. +/// +/// These must be monotonically increasing. See [`ArrayBytes::Variable`](crate::array::ArrayBytes::Variable). 
+#[derive(Clone, Debug, PartialEq, Eq)] +pub struct RawBytesOffsets<'a>(Cow<'a, [usize]>); + +impl Deref for RawBytesOffsets<'_> { + type Target = [usize]; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +/// An error creating [`RawBytesOffsets`]. +/// +/// This error occurs when the offsets are not monotonically increasing. +#[derive(Debug, Error, Display)] +pub struct RawBytesOffsetsCreateError; + +impl<'a> RawBytesOffsets<'a> { + /// Creates a new `RawBytesOffsets`. + /// + /// # Errors + /// Returns an error if the offsets are not monotonically increasing. + pub fn new(offsets: impl Into>) -> Result { + let offsets = offsets.into(); + if offsets.windows(2).all(|w| w[1] >= w[0]) { + Ok(Self(offsets)) + } else { + Err(RawBytesOffsetsCreateError) + } + } + + /// Creates a new `RawBytesOffsets` without checking the offsets. + /// + /// # Safety + /// The offsets must be monotonically increasing. + #[must_use] + pub unsafe fn new_unchecked(offsets: impl Into>) -> Self { + let offsets = offsets.into(); + debug_assert!(offsets.windows(2).all(|w| w[1] >= w[0])); + Self(offsets) + } + + /// Clones the offsets if not already owned. 
+ #[must_use] + pub fn into_owned(self) -> RawBytesOffsets<'static> { + RawBytesOffsets(self.0.into_owned().into()) + } +} + +impl<'a> TryFrom> for RawBytesOffsets<'a> { + type Error = RawBytesOffsetsCreateError; + + fn try_from(value: Cow<'a, [usize]>) -> Result { + Self::new(value) + } +} + +impl<'a> TryFrom<&'a [usize]> for RawBytesOffsets<'a> { + type Error = RawBytesOffsetsCreateError; + + fn try_from(value: &'a [usize]) -> Result { + Self::new(value) + } +} + +impl<'a, const N: usize> TryFrom<&'a [usize; N]> for RawBytesOffsets<'a> { + type Error = RawBytesOffsetsCreateError; + + fn try_from(value: &'a [usize; N]) -> Result { + Self::new(value) + } +} + +impl TryFrom> for RawBytesOffsets<'_> { + type Error = RawBytesOffsetsCreateError; + + fn try_from(value: Vec) -> Result { + Self::new(value) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn raw_bytes_offsets() { + let offsets = RawBytesOffsets::new(vec![0, 1, 2, 3]).unwrap(); + assert_eq!(&*offsets, &[0, 1, 2, 3]); + assert!(RawBytesOffsets::new(vec![0, 1, 1]).is_ok()); + assert!(RawBytesOffsets::new(vec![0, 1, 0]).is_err()); + assert!(RawBytesOffsets::try_from(vec![0, 1, 2]).is_ok()); + assert!(RawBytesOffsets::try_from(vec![0, 1, 0]).is_err()); + assert!(RawBytesOffsets::try_from([0, 1, 2].as_slice()).is_ok()); + assert!(RawBytesOffsets::try_from([0, 1, 0].as_slice()).is_err()); + assert!(RawBytesOffsets::try_from(&[0, 1, 2]).is_ok()); + assert!(RawBytesOffsets::try_from(&[0, 1, 0]).is_err()); + assert!(RawBytesOffsets::try_from(Cow::Owned(vec![0, 1, 0])).is_err()); + } +} diff --git a/zarrs/src/array/codec.rs b/zarrs/src/array/codec.rs index a32de1b3..a520f2c5 100644 --- a/zarrs/src/array/codec.rs +++ b/zarrs/src/array/codec.rs @@ -98,10 +98,10 @@ use std::borrow::Cow; use std::sync::Arc; use super::{ - concurrency::RecommendedConcurrency, BytesRepresentation, ChunkRepresentation, ChunkShape, - DataType, + array_bytes::RawBytesOffsetsCreateError, concurrency::RecommendedConcurrency, 
ArrayBytes, + ArrayBytesFixedDisjointView, BytesRepresentation, ChunkRepresentation, ChunkShape, DataType, + RawBytes, }; -use super::{ArrayBytes, ArrayBytesFixedDisjointView, RawBytes}; /// A codec plugin. pub type CodecPlugin = Plugin; @@ -1060,6 +1060,9 @@ pub enum CodecError { /// Subset out of bounds. #[error(transparent)] SubsetOutOfBounds(#[from] SubsetOutOfBoundsError), + /// Invalid byte offsets for variable length data. + #[error(transparent)] + RawBytesOffsetsCreate(#[from] RawBytesOffsetsCreateError), } impl From<&str> for CodecError { diff --git a/zarrs/src/array/codec/array_to_array/transpose.rs b/zarrs/src/array/codec/array_to_array/transpose.rs index 7818ccf7..0276fd76 100644 --- a/zarrs/src/array/codec/array_to_array/transpose.rs +++ b/zarrs/src/array/codec/array_to_array/transpose.rs @@ -120,6 +120,10 @@ fn transpose_vlen<'a>( bytes_new.extend_from_slice(&bytes[curr..next]); } offsets_new.push(bytes_new.len()); + let offsets_new = unsafe { + // SAFETY: The offsets are monotonically increasing. 
+ RawBytesOffsets::new_unchecked(offsets_new) + }; ArrayBytes::new_vlen(bytes_new, offsets_new) } diff --git a/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs b/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs index cc3c44e2..28956d89 100644 --- a/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs +++ b/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs @@ -9,7 +9,7 @@ use crate::{ CodecOptions, CodecTraits, RecommendedConcurrency, }, transmute_to_bytes_vec, ArrayBytes, BytesRepresentation, ChunkRepresentation, CodecChain, - DataType, DataTypeSize, Endianness, FillValue, RawBytes, + DataType, DataTypeSize, Endianness, FillValue, RawBytes, RawBytesOffsets, }, config::global_config, metadata::v3::{array::codec::vlen::VlenIndexDataType, MetadataV3}, @@ -265,14 +265,16 @@ impl ArrayToBytesCodecTraits for VlenCodec { } } .unwrap(); - let (data, index) = super::get_vlen_bytes_and_offsets( + let (data, offsets) = super::get_vlen_bytes_and_offsets( &index_chunk_rep, &bytes, &self.index_codecs, &self.data_codecs, options, )?; - Ok(ArrayBytes::new_vlen(data, index)) + let offsets = RawBytesOffsets::new(offsets)?; + + Ok(ArrayBytes::new_vlen(data, offsets)) } fn partial_decoder( diff --git a/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_codec.rs b/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_codec.rs index def1a44d..0970a8fa 100644 --- a/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_codec.rs +++ b/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_codec.rs @@ -11,6 +11,7 @@ use crate::{ RecommendedConcurrency, }, ArrayBytes, BytesRepresentation, ChunkRepresentation, DataTypeSize, RawBytes, + RawBytesOffsets, }, config::global_config, metadata::v3::MetadataV3, @@ -110,6 +111,7 @@ impl ArrayToBytesCodecTraits for VlenV2Codec { ) -> Result, CodecError> { let num_elements = decoded_representation.num_elements_usize(); let (bytes, offsets) = super::get_interleaved_bytes_and_offsets(num_elements, &bytes)?; + let offsets = 
RawBytesOffsets::new(offsets)?; Ok(ArrayBytes::new_vlen(bytes, offsets)) } diff --git a/zarrs/src/array/element.rs b/zarrs/src/array/element.rs index c28c8612..b2949898 100644 --- a/zarrs/src/array/element.rs +++ b/zarrs/src/array/element.rs @@ -3,7 +3,9 @@ use std::mem::ManuallyDrop; use itertools::Itertools; use ArrayError::IncompatibleElementType as IET; -use super::{convert_from_bytes_slice, transmute_to_bytes, ArrayBytes, ArrayError, DataType}; +use super::{ + convert_from_bytes_slice, transmute_to_bytes, ArrayBytes, ArrayError, DataType, RawBytesOffsets, +}; /// A trait representing an array element type. pub trait Element: Sized + Clone { @@ -184,6 +186,10 @@ macro_rules! impl_element_string { len = len.checked_add(element.len()).unwrap(); } offsets.push(len); + let offsets = unsafe { + // SAFETY: The offsets are monotonically increasing. + RawBytesOffsets::new_unchecked(offsets) + }; // Concatenate bytes let mut bytes = Vec::with_capacity(usize::try_from(len).unwrap()); @@ -238,6 +244,10 @@ macro_rules! impl_element_binary { len = len.checked_add(element.len()).unwrap(); } offsets.push(len); + let offsets = unsafe { + // SAFETY: The offsets are monotonically increasing. 
+ RawBytesOffsets::new_unchecked(offsets) + }; // Concatenate bytes let bytes = elements.concat(); From 4c2d73c7c4fef3b9cfee4bc3c89b683e310f44a8 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Thu, 23 Jan 2025 09:31:03 +1100 Subject: [PATCH 24/45] refactor!: validate `ArrayBytes::Variable` (#138) --- CHANGELOG.md | 3 +- zarrs/src/array.rs | 2 +- zarrs/src/array/array_bytes.rs | 92 ++++++++++++++++--- .../array/array_bytes/raw_bytes_offsets.rs | 30 +++++- zarrs/src/array/codec.rs | 4 + .../array/codec/array_to_array/transpose.rs | 7 +- .../codec/array_to_bytes/vlen/vlen_codec.rs | 6 +- .../array_to_bytes/vlen_v2/vlen_v2_codec.rs | 3 +- zarrs/src/array/element.rs | 12 ++- 9 files changed, 132 insertions(+), 27 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 606b7eef..acc6b1a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,7 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **Breaking**: change `ArraySubset::inbounds` to take another subset rather than a shape - **Breaking**: `CodecError` enum changes: - Change `CodecError::UnexpectedChunkDecodedSize` to an `InvalidBytesLengthError` - - Add `CodecError::{InvalidArrayShape,InvalidNumberOfElements,SubsetOutOfBounds}` + - Add `CodecError::{InvalidArrayShape,InvalidNumberOfElements,SubsetOutOfBounds,RawBytesOffsetsCreate,RawBytesOffsetsOutOfBounds}` - **Breaking**: Change output args to `ArrayBytesFixedDisjointView` and make safe the following: - `Array::[async_]retrieve_chunk[_subset]_into` - `[Async]ArrayPartialDecoderTraits::partial_decode_into` @@ -24,6 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `zarrs::array::copy_fill_value_into` - `zarrs::array::update_array_bytes` - **Breaking**: change `RawBytesOffsets` into a validated newtype +- **Breaking**: `ArrayBytes::new_vlen()` now returns a `Result` and validates bytes/offsets compatibility ## [0.19.1] - 2025-01-19 diff --git a/zarrs/src/array.rs 
b/zarrs/src/array.rs index 1f7b17f6..0d5cd65a 100644 --- a/zarrs/src/array.rs +++ b/zarrs/src/array.rs @@ -49,7 +49,7 @@ pub use self::{ array_builder::ArrayBuilder, array_bytes::{ copy_fill_value_into, update_array_bytes, ArrayBytes, ArrayBytesError, RawBytes, - RawBytesOffsets, RawBytesOffsetsCreateError, + RawBytesOffsets, RawBytesOffsetsCreateError, RawBytesOffsetsOutOfBoundsError, }, array_bytes_fixed_disjoint_view::{ ArrayBytesFixedDisjointView, ArrayBytesFixedDisjointViewCreateError, diff --git a/zarrs/src/array/array_bytes.rs b/zarrs/src/array/array_bytes.rs index 498d66b2..42210075 100644 --- a/zarrs/src/array/array_bytes.rs +++ b/zarrs/src/array/array_bytes.rs @@ -1,5 +1,6 @@ use std::borrow::Cow; +use derive_more::derive::Display; use itertools::Itertools; use thiserror::Error; use unsafe_cell_slice::UnsafeCellSlice; @@ -38,9 +39,18 @@ pub enum ArrayBytes<'a> { /// The bytes and offsets are modeled on the [Apache Arrow Variable-size Binary Layout](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout). /// - The offsets buffer contains length + 1 ~~signed integers (either 32-bit or 64-bit, depending on the data type)~~ usize integers. /// - Offsets must be monotonically increasing, that is `offsets[j+1] >= offsets[j]` for `0 <= j < length`, even for null slots. Thus, the bytes represent C-contiguous elements with padding permitted. + /// - The final offset must be less than or equal to the length of the bytes buffer. Variable(RawBytes<'a>, RawBytesOffsets<'a>), } +/// An error raised if variable length array bytes offsets are out of bounds. +#[derive(Debug, Error, Display)] +#[display("Offset {offset} is out of bounds for bytes of length {len}")] +pub struct RawBytesOffsetsOutOfBoundsError { + offset: usize, + len: usize, +} + /// Errors related to [`ArrayBytes<'_>`] and [`ArrayBytes`]. 
#[derive(Debug, Error)] pub enum ArrayBytesError { @@ -58,8 +68,35 @@ impl<'a> ArrayBytes<'a> { } /// Create a new variable length array bytes from `bytes` and `offsets`. - pub fn new_vlen(bytes: impl Into>, offsets: RawBytesOffsets<'a>) -> Self { - Self::Variable(bytes.into(), offsets) + /// + /// # Errors + /// Returns a [`RawBytesOffsetsOutOfBoundsError`] if the last offset is out of bounds of the bytes. + pub fn new_vlen( + bytes: impl Into>, + offsets: RawBytesOffsets<'a>, + ) -> Result { + let bytes = bytes.into(); + if offsets.last() <= bytes.len() { + Ok(Self::Variable(bytes, offsets)) + } else { + Err(RawBytesOffsetsOutOfBoundsError { + offset: offsets.last(), + len: bytes.len(), + }) + } + } + + /// Create a new variable length array bytes from `bytes` and `offsets` without checking the offsets. + /// + /// # Safety + /// The last offset must be less than or equal to the length of the bytes. + pub unsafe fn new_vlen_unchecked( + bytes: impl Into>, + offsets: RawBytesOffsets<'a>, + ) -> Self { + let bytes = bytes.into(); + debug_assert!(offsets.last() <= bytes.len()); + Self::Variable(bytes, offsets) } /// Create a new [`ArrayBytes`] with `num_elements` composed entirely of the `fill_value`. @@ -78,14 +115,18 @@ impl<'a> ArrayBytes<'a> { } ArraySize::Variable { num_elements } => { let num_elements = usize::try_from(num_elements).unwrap(); - Self::new_vlen(fill_value.as_ne_bytes().repeat(num_elements), unsafe { + let offsets = unsafe { // SAFETY: The offsets are monotonically increasing. 
RawBytesOffsets::new_unchecked( (0..=num_elements) .map(|i| i * fill_value.size()) .collect::>(), ) - }) + }; + unsafe { + // SAFETY: The last offset is equal to the length of the bytes + Self::new_vlen_unchecked(fill_value.as_ne_bytes().repeat(num_elements), offsets) + } } } } @@ -135,9 +176,9 @@ impl<'a> ArrayBytes<'a> { #[must_use] pub fn into_owned<'b>(self) -> ArrayBytes<'b> { match self { - Self::Fixed(bytes) => ArrayBytes::<'b>::new_flen(bytes.into_owned()), + Self::Fixed(bytes) => ArrayBytes::<'b>::Fixed(bytes.into_owned().into()), Self::Variable(bytes, offsets) => { - ArrayBytes::<'b>::new_vlen(bytes.into_owned(), offsets.into_owned()) + ArrayBytes::<'b>::Variable(bytes.into_owned().into(), offsets.into_owned()) } } } @@ -206,7 +247,11 @@ impl<'a> ArrayBytes<'a> { // SAFETY: The offsets are monotonically increasing. RawBytesOffsets::new_unchecked(ss_offsets) }; - Ok(ArrayBytes::new_vlen(ss_bytes, ss_offsets)) + let array_bytes = unsafe { + // SAFETY: The last offset is equal to the length of the bytes + ArrayBytes::new_vlen_unchecked(ss_bytes, ss_offsets) + }; + Ok(array_bytes) } ArrayBytes::Fixed(bytes) => { let byte_ranges = @@ -337,8 +382,11 @@ pub(crate) fn update_bytes_vlen<'a>( // SAFETY: The offsets are monotonically increasing. RawBytesOffsets::new_unchecked(offsets_new) }; - - Ok(ArrayBytes::new_vlen(bytes_new, offsets_new)) + let array_bytes = unsafe { + // SAFETY: The last offset is equal to the length of the bytes + ArrayBytes::new_vlen_unchecked(bytes_new, offsets_new) + }; + Ok(array_bytes) } /// Update a subset of an array. 
@@ -448,7 +496,7 @@ pub(crate) fn merge_chunks_vlen<'a>( // Write bytes // TODO: Go parallel - let mut bytes = vec![0; *offsets.last().unwrap()]; + let mut bytes = vec![0; offsets.last()]; for (chunk_bytes, chunk_subset) in chunk_bytes_and_subsets { let (chunk_bytes, chunk_offsets) = chunk_bytes.into_variable()?; let indices = chunk_subset.linearised_indices(array_shape).unwrap(); @@ -462,7 +510,12 @@ pub(crate) fn merge_chunks_vlen<'a>( } } - Ok(ArrayBytes::new_vlen(bytes, offsets)) + let array_bytes = unsafe { + // SAFETY: The last offset is equal to the length of the bytes + ArrayBytes::new_vlen_unchecked(bytes, offsets) + }; + + Ok(array_bytes) } pub(crate) fn extract_decoded_regions_vlen<'a>( @@ -496,7 +549,11 @@ pub(crate) fn extract_decoded_regions_vlen<'a>( // SAFETY: The offsets are monotonically increasing. RawBytesOffsets::new_unchecked(region_offsets) }; - out.push(ArrayBytes::new_vlen(region_bytes, region_offsets)); + let array_bytes = unsafe { + // SAFETY: The last offset is equal to the length of the bytes + ArrayBytes::new_vlen_unchecked(region_bytes, region_offsets) + }; + out.push(array_bytes); } Ok(out) } @@ -599,6 +656,17 @@ mod tests { Ok(()) } + #[test] + fn array_bytes_vlen() { + let data = [0u8, 1, 2, 3, 4]; + assert!(ArrayBytes::new_vlen(&data, vec![0].try_into().unwrap()).is_ok()); + assert!(ArrayBytes::new_vlen(&data, vec![0, 5].try_into().unwrap()).is_ok()); + assert!(ArrayBytes::new_vlen(&data, vec![0, 5, 5].try_into().unwrap()).is_ok()); + assert!(ArrayBytes::new_vlen(&data, vec![0, 5, 6].try_into().unwrap()).is_err()); + assert!(ArrayBytes::new_vlen(&data, vec![0, 1, 3, 5].try_into().unwrap()).is_ok()); + assert!(ArrayBytes::new_vlen(&data, vec![0, 1, 3, 6].try_into().unwrap()).is_err()); + } + #[test] fn array_bytes_str() -> Result<(), Box> { let data = ["a", "bb", "ccc"]; diff --git a/zarrs/src/array/array_bytes/raw_bytes_offsets.rs b/zarrs/src/array/array_bytes/raw_bytes_offsets.rs index 4b335014..3de578f3 100644 --- 
a/zarrs/src/array/array_bytes/raw_bytes_offsets.rs +++ b/zarrs/src/array/array_bytes/raw_bytes_offsets.rs @@ -18,10 +18,15 @@ impl Deref for RawBytesOffsets<'_> { } /// An error creating [`RawBytesOffsets`]. -/// -/// This error occurs when the offsets are not monotonically increasing. #[derive(Debug, Error, Display)] -pub struct RawBytesOffsetsCreateError; +pub enum RawBytesOffsetsCreateError { + /// The offsets length must be greater than zero. + #[display("offsets length must be greater than zero")] + ZeroLength, + /// The offsets are not monotonically increasing. + #[display("offsets are not monotonically increasing")] + NotMonotonicallyIncreasing, +} impl<'a> RawBytesOffsets<'a> { /// Creates a new `RawBytesOffsets`. @@ -30,10 +35,12 @@ impl<'a> RawBytesOffsets<'a> { /// Returns an error if the offsets are not monotonically increasing. pub fn new(offsets: impl Into>) -> Result { let offsets = offsets.into(); - if offsets.windows(2).all(|w| w[1] >= w[0]) { + if offsets.is_empty() { + Err(RawBytesOffsetsCreateError::ZeroLength) + } else if offsets.windows(2).all(|w| w[1] >= w[0]) { Ok(Self(offsets)) } else { - Err(RawBytesOffsetsCreateError) + Err(RawBytesOffsetsCreateError::NotMonotonicallyIncreasing) } } @@ -44,6 +51,7 @@ impl<'a> RawBytesOffsets<'a> { #[must_use] pub unsafe fn new_unchecked(offsets: impl Into>) -> Self { let offsets = offsets.into(); + debug_assert!(!offsets.is_empty()); debug_assert!(offsets.windows(2).all(|w| w[1] >= w[0])); Self(offsets) } @@ -53,6 +61,15 @@ impl<'a> RawBytesOffsets<'a> { pub fn into_owned(self) -> RawBytesOffsets<'static> { RawBytesOffsets(self.0.into_owned().into()) } + + /// Returns the last offset. + #[must_use] + pub fn last(&self) -> usize { + unsafe { + // SAFETY: The offsets cannot be empty. 
+ *self.0.last().unwrap_unchecked() + } + } } impl<'a> TryFrom> for RawBytesOffsets<'a> { @@ -95,6 +112,9 @@ mod tests { fn raw_bytes_offsets() { let offsets = RawBytesOffsets::new(vec![0, 1, 2, 3]).unwrap(); assert_eq!(&*offsets, &[0, 1, 2, 3]); + assert!(RawBytesOffsets::new(vec![]).is_err()); + assert!(RawBytesOffsets::new(vec![0]).is_ok()); + assert!(RawBytesOffsets::new(vec![10]).is_ok()); // nonsense, but not invalid assert!(RawBytesOffsets::new(vec![0, 1, 1]).is_ok()); assert!(RawBytesOffsets::new(vec![0, 1, 0]).is_err()); assert!(RawBytesOffsets::try_from(vec![0, 1, 2]).is_ok()); diff --git a/zarrs/src/array/codec.rs b/zarrs/src/array/codec.rs index a520f2c5..f5adeed0 100644 --- a/zarrs/src/array/codec.rs +++ b/zarrs/src/array/codec.rs @@ -97,6 +97,7 @@ use std::any::Any; use std::borrow::Cow; use std::sync::Arc; +use super::RawBytesOffsetsOutOfBoundsError; use super::{ array_bytes::RawBytesOffsetsCreateError, concurrency::RecommendedConcurrency, ArrayBytes, ArrayBytesFixedDisjointView, BytesRepresentation, ChunkRepresentation, ChunkShape, DataType, @@ -1063,6 +1064,9 @@ pub enum CodecError { /// Invalid byte offsets for variable length data. #[error(transparent)] RawBytesOffsetsCreate(#[from] RawBytesOffsetsCreateError), + /// Variable length array bytes offsets are out of bounds. + #[error(transparent)] + RawBytesOffsetsOutOfBounds(#[from] RawBytesOffsetsOutOfBoundsError), } impl From<&str> for CodecError { diff --git a/zarrs/src/array/codec/array_to_array/transpose.rs b/zarrs/src/array/codec/array_to_array/transpose.rs index 0276fd76..8e0f1ec5 100644 --- a/zarrs/src/array/codec/array_to_array/transpose.rs +++ b/zarrs/src/array/codec/array_to_array/transpose.rs @@ -124,8 +124,11 @@ fn transpose_vlen<'a>( // SAFETY: The offsets are monotonically increasing. 
RawBytesOffsets::new_unchecked(offsets_new) }; - - ArrayBytes::new_vlen(bytes_new, offsets_new) + let array_bytes = unsafe { + // SAFETY: The last offset is equal to the length of the bytes + ArrayBytes::new_vlen_unchecked(bytes_new, offsets_new) + }; + array_bytes } #[cfg(test)] diff --git a/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs b/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs index 28956d89..9e0a23a3 100644 --- a/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs +++ b/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs @@ -265,7 +265,7 @@ impl ArrayToBytesCodecTraits for VlenCodec { } } .unwrap(); - let (data, offsets) = super::get_vlen_bytes_and_offsets( + let (bytes, offsets) = super::get_vlen_bytes_and_offsets( &index_chunk_rep, &bytes, &self.index_codecs, @@ -273,8 +273,8 @@ impl ArrayToBytesCodecTraits for VlenCodec { options, )?; let offsets = RawBytesOffsets::new(offsets)?; - - Ok(ArrayBytes::new_vlen(data, offsets)) + let array_bytes = ArrayBytes::new_vlen(bytes, offsets)?; + Ok(array_bytes) } fn partial_decoder( diff --git a/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_codec.rs b/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_codec.rs index 0970a8fa..e0bcbef3 100644 --- a/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_codec.rs +++ b/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_codec.rs @@ -112,7 +112,8 @@ impl ArrayToBytesCodecTraits for VlenV2Codec { let num_elements = decoded_representation.num_elements_usize(); let (bytes, offsets) = super::get_interleaved_bytes_and_offsets(num_elements, &bytes)?; let offsets = RawBytesOffsets::new(offsets)?; - Ok(ArrayBytes::new_vlen(bytes, offsets)) + let array_bytes = ArrayBytes::new_vlen(bytes, offsets)?; + Ok(array_bytes) } fn partial_decoder( diff --git a/zarrs/src/array/element.rs b/zarrs/src/array/element.rs index b2949898..aa826358 100644 --- a/zarrs/src/array/element.rs +++ b/zarrs/src/array/element.rs @@ -196,7 +196,11 @@ macro_rules! 
impl_element_string { for element in elements { bytes.extend_from_slice(element.as_bytes()); } - Ok(ArrayBytes::new_vlen(bytes, offsets)) + let array_bytes = unsafe { + // SAFETY: The last offset is the length of the bytes. + ArrayBytes::new_vlen_unchecked(bytes, offsets) + }; + Ok(array_bytes) } } }; @@ -252,7 +256,11 @@ macro_rules! impl_element_binary { // Concatenate bytes let bytes = elements.concat(); - Ok(ArrayBytes::new_vlen(bytes, offsets)) + let array_bytes = unsafe { + // SAFETY: The last offset is the length of the bytes. + ArrayBytes::new_vlen_unchecked(bytes, offsets) + }; + Ok(array_bytes) } } }; From 5a71ac9ab1c5b286cc324cbfe15a8599f456ebe3 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Fri, 24 Jan 2025 07:11:21 +1100 Subject: [PATCH 25/45] fix: Reenable broken compatibility tests since fixed in `zarr-python`/`numcodecs` --- CHANGELOG.md | 1 + zarrs/src/array.rs | 4 +- zarrs/tests/data/v2/array_blosc_F.zarr/0.0 | Bin 116 -> 116 bytes zarrs/tests/data/v2/array_blosc_F.zarr/0.1 | Bin 116 -> 116 bytes zarrs/tests/data/v2/array_blosc_F.zarr/1.0 | Bin 116 -> 116 bytes zarrs/tests/data/v2/array_blosc_F.zarr/1.1 | Bin 116 -> 116 bytes zarrs/tests/data/v2/array_gzip_C.zarr/0.0 | Bin 83 -> 83 bytes zarrs/tests/data/v2/array_gzip_C.zarr/0.1 | Bin 83 -> 83 bytes zarrs/tests/data/v2/array_gzip_C.zarr/1.0 | Bin 81 -> 81 bytes zarrs/tests/data/v2/array_gzip_C.zarr/1.1 | Bin 79 -> 79 bytes zarrs/tests/data/v2/array_none_F.zarr/0.0 | Bin 100 -> 100 bytes zarrs/tests/data/v2/array_none_F.zarr/0.1 | Bin 100 -> 100 bytes zarrs/tests/data/v2/array_none_F.zarr/1.0 | Bin 100 -> 100 bytes zarrs/tests/data/v2/array_none_F.zarr/1.1 | Bin 100 -> 100 bytes zarrs/tests/data/v2_generate.py | 11 +++++ .../v3/array_blosc_transpose.zarr/.zarray | 6 +-- .../data/v3/array_none_transpose.zarr/.zarray | 18 +++++++ .../data/v3/array_none_transpose.zarr/.zattrs | 3 ++ .../data/v3/array_none_transpose.zarr/0.0 | Bin 0 -> 100 bytes .../data/v3/array_none_transpose.zarr/0.1 | Bin 0 -> 
100 bytes .../data/v3/array_none_transpose.zarr/1.0 | Bin 0 -> 100 bytes .../data/v3/array_none_transpose.zarr/1.1 | Bin 0 -> 100 bytes .../v3/array_none_transpose.zarr/zarr.json | 45 ++++++++++++++++++ zarrs/tests/data/v3_generate.py | 15 +++++- .../data/v3_zarr_python/array_gzip.zarr/c/0/0 | Bin 83 -> 83 bytes .../data/v3_zarr_python/array_gzip.zarr/c/0/1 | Bin 83 -> 83 bytes .../data/v3_zarr_python/array_gzip.zarr/c/1/0 | Bin 81 -> 81 bytes .../data/v3_zarr_python/array_gzip.zarr/c/1/1 | Bin 79 -> 79 bytes 28 files changed, 95 insertions(+), 8 deletions(-) mode change 100644 => 100755 zarrs/tests/data/v2_generate.py create mode 100644 zarrs/tests/data/v3/array_none_transpose.zarr/.zarray create mode 100644 zarrs/tests/data/v3/array_none_transpose.zarr/.zattrs create mode 100644 zarrs/tests/data/v3/array_none_transpose.zarr/0.0 create mode 100644 zarrs/tests/data/v3/array_none_transpose.zarr/0.1 create mode 100644 zarrs/tests/data/v3/array_none_transpose.zarr/1.0 create mode 100644 zarrs/tests/data/v3/array_none_transpose.zarr/1.1 create mode 100644 zarrs/tests/data/v3/array_none_transpose.zarr/zarr.json mode change 100644 => 100755 zarrs/tests/data/v3_generate.py diff --git a/CHANGELOG.md b/CHANGELOG.md index acc6b1a1..1457d95d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `zarrs::array::update_array_bytes` - **Breaking**: change `RawBytesOffsets` into a validated newtype - **Breaking**: `ArrayBytes::new_vlen()` not returns a `Result` and validates bytes/offsets compatibility +- Reenable broken compatibility tests since fixed in `zarr-python`/`numcodecs` ## [0.19.1] - 2025-01-19 diff --git a/zarrs/src/array.rs b/zarrs/src/array.rs index 0d5cd65a..b3cc4333 100644 --- a/zarrs/src/array.rs +++ b/zarrs/src/array.rs @@ -1103,12 +1103,11 @@ mod tests { ) } - #[ignore] // FIXME: Reported upstream https://github.com/zarr-developers/zarr-python/issues/2675 #[test] fn 
array_v2_none_f() { array_v2_to_v3( "tests/data/v2/array_none_F.zarr", - "tests/data/v3/array_none_tranpose.zarr", + "tests/data/v3/array_none_transpose.zarr", ) } @@ -1123,7 +1122,6 @@ mod tests { } #[cfg(feature = "blosc")] - #[ignore] // FIXME: Reported upstream https://github.com/zarr-developers/zarr-python/issues/2675 #[test] #[cfg_attr(miri, ignore)] fn array_v2_blosc_f() { diff --git a/zarrs/tests/data/v2/array_blosc_F.zarr/0.0 b/zarrs/tests/data/v2/array_blosc_F.zarr/0.0 index 404ac43c49420e1c968d07e59a97740f8de210e9..f41011460aff3d967493e5626c45e007758935a6 100644 GIT binary patch literal 116 zcmZQ#oW_#Ez`y{)B|r=T3XTj63xN0o5Gyz_Ff`aRFc<*YD}eY15UT*i7#x5I$ld_N k3_!jH5IX?H0)YG-K+FQ<>i}^BP%Ht+KLErWK)!(!0Iv%Wp#T5? literal 116 zcmXBFp%H)}0EOYFM_>$sfnZ<*kO>@tzz7b&6lSB*Xmli_(ckmk|K01m)l^BJ1^-&) e6QRLPQb59tf(<(koVak~fwE*^;V{5sM943qyAYuO diff --git a/zarrs/tests/data/v2/array_blosc_F.zarr/0.1 b/zarrs/tests/data/v2/array_blosc_F.zarr/0.1 index 91cfc08883f5cb9d03b24b790619586b874738b9..4343539f204c43f512a6ef078f590129e08b5371 100644 GIT binary patch literal 116 zcmZQ#oW_#Ez`y{)B|yx;u)u+Vp}>)W;RF!#I59Ao0Pz8!SObuM0f+^Fd}Hhon<1TE gx{#njsxe`~iVX*DJb00iQ3XCM8aTA*(8D8!4|p3A@&Et; diff --git a/zarrs/tests/data/v2/array_blosc_F.zarr/1.0 b/zarrs/tests/data/v2/array_blosc_F.zarr/1.0 index 8f42c6fe5aa7be0ae0859ab494a9853ea9f7321f..823b486ae1d2ca19f0b6c6a9b05ca8bd19dc7013 100644 GIT binary patch literal 116 zcmWl|u?c`M07b!X$YiN3EG$wufCE^hu&~GgA|hffBH{odA`a)v!I^BawWKsq%AM4( e=gNx(0|#!rS+eEGoew=bPCWRsV&u$|pZfufr4`Ts literal 116 zcmWl|u?c`M07b!X$YiN3EG$wufCE^hu&~GgA|hffBH{odA`a)v!I^BawWKsq%AHiO eq-Vv3fh{{m_8d5J;>?vBcOE=>@#e#qpZftur4`Ts diff --git a/zarrs/tests/data/v2/array_blosc_F.zarr/1.1 b/zarrs/tests/data/v2/array_blosc_F.zarr/1.1 index 55c1b71f553bc0b0dc22ed43f0597e053ebe097e..26bceb7a4ee707ced926d9b38f3a7363e0324970 100644 GIT binary patch literal 116 zcmWl|u?c`M07b!XaI(x{ks^hKr4xt*$pAq_96&_G0Yrj0m(PPUMJm0de@m&dQpd!V b7ae=_sgfb11O`~!$pfMN^|Km=rO0AdCpUjv98fMNka 
a{th5!0rGW#xB)1Z0OTJ4Vh$kRzzG23@earU literal 100 zcmWl}p%s8I5JbTf2+D{+AQ0(*kP0Y)Qc?;Mi9}Ko63MXd&qn-fvn!&JmWWJYWn<^y S={7(0*GaRd>1DGbKwyg literal 100 zcmWN{p&39x7y!_J06`E0V+ew9?lX`PL{SutB8sAT(HLHM?M0j^;-U~!M9sp=#?Hab S!%N|#%03M(9X$ghi8=qR0TCGh diff --git a/zarrs/tests/data/v2/array_none_F.zarr/1.0 b/zarrs/tests/data/v2/array_none_F.zarr/1.0 index b15fa039470372068797f1b20abcec026ddf2d36..638a99a9b052af0352ffc268e0d0665a47d58efa 100644 GIT binary patch literal 100 zcmWN=F$zFn7y!`kZ2x317z~bKFc=IDAW04&Ne&=M4(Gjk5m}0;9bCQ4Z5-XaEo`0K QeJt&qJ$$X~T|E8L55bEQ_W%F@ literal 100 zcmWN=F$zFn7y!`kZ2x317z~bKFc=IDAW04&Ne&=M4(Gjk5m}1JEiA39ZEWrA9UPsU QU0mJVJv_a2L}gJNRk00$pDgM07){J_v%GtDWZ08^)k0{bn~{bb#nKy Ov~%|GwX%2d^h-YjiWN5i literal 100 zcmWN=!3jWM7y!_3wto%>2L}gJNRk00$pDgM07){J_v%GtDI&M9w6eCbwX=6{baHla Ob#wRd^z!!c^-Dj)yA?M8 diff --git a/zarrs/tests/data/v2_generate.py b/zarrs/tests/data/v2_generate.py old mode 100644 new mode 100755 index caad33e6..480c1867 --- a/zarrs/tests/data/v2_generate.py +++ b/zarrs/tests/data/v2_generate.py @@ -1,3 +1,14 @@ +#!/usr/bin/env -S uv run +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "zarr==3.0.1", +# "numcodecs==0.15.0", +# "zfpy==1.0.1", +# "pcodec==0.3.2", +# ] +# /// + import zarr import numpy as np from numcodecs import Blosc, GZip, BZ2, ZFPY, PCodec, Zstd diff --git a/zarrs/tests/data/v3/array_blosc_transpose.zarr/.zarray b/zarrs/tests/data/v3/array_blosc_transpose.zarr/.zarray index d5bdb34a..8bfa6f0e 100644 --- a/zarrs/tests/data/v3/array_blosc_transpose.zarr/.zarray +++ b/zarrs/tests/data/v3/array_blosc_transpose.zarr/.zarray @@ -12,10 +12,10 @@ "dtype": "=_sgfb11O`~!$pfMN^|Km=rO0AdCpUjv98fMNka a{th5!0rGW#xB)1Z0OTJ4Vh$kRzzG23@earU literal 0 HcmV?d00001 diff --git a/zarrs/tests/data/v3/array_none_transpose.zarr/0.1 b/zarrs/tests/data/v3/array_none_transpose.zarr/0.1 new file mode 100644 index 0000000000000000000000000000000000000000..2bc6647607bbd193b5a2c58074dee174346a34f0 GIT binary patch literal 100 
zcmZQzSm40GP~gbGZ~};VoER8PfcOAVtO3Zs0K@`7z6B6J0E%@0`8R-A1jx4mVg?|4 b0uVm{VhJGM0f+^F>={7(0*GaRd>1DGbKwyg literal 0 HcmV?d00001 diff --git a/zarrs/tests/data/v3/array_none_transpose.zarr/1.0 b/zarrs/tests/data/v3/array_none_transpose.zarr/1.0 new file mode 100644 index 0000000000000000000000000000000000000000..638a99a9b052af0352ffc268e0d0665a47d58efa GIT binary patch literal 100 zcmWN=F$zFn7y!`kZ2x317z~bKFc=IDAW04&Ne&=M4(Gjk5m}0;9bCQ4Z5-XaEo`0K QeJt&qJ$$X~T|E8L55bEQ_W%F@ literal 0 HcmV?d00001 diff --git a/zarrs/tests/data/v3/array_none_transpose.zarr/1.1 b/zarrs/tests/data/v3/array_none_transpose.zarr/1.1 new file mode 100644 index 0000000000000000000000000000000000000000..7df5783a011f3b83937d238095f38f46717a34bd GIT binary patch literal 100 zcmWN=!3jWM7y!_3wto%>2L}gJNRk00$pDgM07){J_v%GtDWZ08^)k0{bn~{bb#nKy Ov~%|GwX%2d^h-YjiWN5i literal 0 HcmV?d00001 diff --git a/zarrs/tests/data/v3/array_none_transpose.zarr/zarr.json b/zarrs/tests/data/v3/array_none_transpose.zarr/zarr.json new file mode 100644 index 00000000..ea008be9 --- /dev/null +++ b/zarrs/tests/data/v3/array_none_transpose.zarr/zarr.json @@ -0,0 +1,45 @@ +{ + "zarr_format": 3, + "node_type": "array", + "shape": [ + 10, + 10 + ], + "data_type": "float32", + "chunk_grid": { + "name": "regular", + "configuration": { + "chunk_shape": [ + 5, + 5 + ] + } + }, + "chunk_key_encoding": { + "name": "v2", + "configuration": { + "separator": "." 
+ } + }, + "fill_value": 0.0, + "codecs": [ + { + "name": "transpose", + "configuration": { + "order": [ + 1, + 0 + ] + } + }, + { + "name": "bytes", + "configuration": { + "endian": "little" + } + } + ], + "attributes": { + "key": "value" + } +} \ No newline at end of file diff --git a/zarrs/tests/data/v3_generate.py b/zarrs/tests/data/v3_generate.py old mode 100644 new mode 100755 index 111cb3a4..3cffdcf7 --- a/zarrs/tests/data/v3_generate.py +++ b/zarrs/tests/data/v3_generate.py @@ -1,6 +1,17 @@ -import zarr # 3.0.0 +#!/usr/bin/env -S uv run +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "zarr==3.0.0", +# "numcodecs @ git+https://github.com/zarr-developers/numcodecs.git@8d15c02", # > 0.15.0 +# "zfpy==1.0.1", +# "pcodec==0.3.2", +# ] +# /// + +import zarr import numpy as np -from numcodecs.zarr3 import BZ2, ZFPY, PCodec # 0.14.2.dev22 with https://github.com/zarr-developers/numcodecs/pull/685 +from numcodecs.zarr3 import BZ2, ZFPY, PCodec compressor_blosc = zarr.codecs.BloscCodec(cname="zstd", clevel=1, shuffle=zarr.codecs.BloscShuffle.bitshuffle) compressor_gzip = zarr.codecs.GzipCodec(level=9) diff --git a/zarrs/tests/data/v3_zarr_python/array_gzip.zarr/c/0/0 b/zarrs/tests/data/v3_zarr_python/array_gzip.zarr/c/0/0 index 5a9f93cb7bbcf6a443296ac8fefc075413aaac96..ed30970466ce66bc4a6332be7e17b9e4368f2eb0 100644 GIT binary patch delta 13 UcmWFzW|!~gU^v$@aU#1902+q`CjbBd delta 13 UcmWFzW|!~gV7RH%Fp=E{02s{!$p8QV diff --git a/zarrs/tests/data/v3_zarr_python/array_gzip.zarr/c/0/1 b/zarrs/tests/data/v3_zarr_python/array_gzip.zarr/c/0/1 index 64597638de76b2f06d80951f3a9231ff9eb938a7..49038743a291bd198539210c1a931a72ec24448c 100644 GIT binary patch delta 13 UcmWFzW|!~gU^v$@aU#1902+q`CjbBd delta 13 UcmWFzW|!~gV7RH%Fp=E{02s{!$p8QV diff --git a/zarrs/tests/data/v3_zarr_python/array_gzip.zarr/c/1/0 b/zarrs/tests/data/v3_zarr_python/array_gzip.zarr/c/1/0 index 4543b4eeb79ee911ccb0236206ac48aa25d0532a..c5790a72b2d3ce4928293c5cd6dbe565c230892d 100644 GIT 
binary patch delta 13 UcmWFxWS8&eU^v$@aU#1H02)&SApigX delta 13 UcmWFxWS8&eV7RH%Fp=F002rAA!vFvP diff --git a/zarrs/tests/data/v3_zarr_python/array_gzip.zarr/c/1/1 b/zarrs/tests/data/v3_zarr_python/array_gzip.zarr/c/1/1 index a99f64d9b70833509a9ecb663012a93be4d06849..cebc0fb0648dc7a5f9faadf7dfeaa5c144813062 100644 GIT binary patch delta 13 UcmebGXP58hU^v$@aU#1102&_z8vp Date: Fri, 24 Jan 2025 10:30:32 +1100 Subject: [PATCH 26/45] fix: `array_v2_none_f` test failure in CI with no default features --- zarrs/src/array.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/zarrs/src/array.rs b/zarrs/src/array.rs index b3cc4333..ff7eb378 100644 --- a/zarrs/src/array.rs +++ b/zarrs/src/array.rs @@ -1103,6 +1103,7 @@ mod tests { ) } + #[cfg(feature = "transpose")] #[test] fn array_v2_none_f() { array_v2_to_v3( From a79f091080a1f1ac79e246ec4e5e4059ad33cfd1 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Fri, 24 Jan 2025 10:37:25 +1100 Subject: [PATCH 27/45] refactor!: move the `zarrs::array::{data_type,fill_value}` modules into the `zarrs_data_type` crate (#139) --- CHANGELOG.md | 1 + Cargo.toml | 12 +++++++ README.md | 2 ++ zarrs/Cargo.toml | 5 +-- zarrs/doc/ecosystem.md | 2 ++ zarrs/doc/status/data_types.md | 36 +++++++++---------- zarrs/src/array.rs | 7 ++-- zarrs/src/array/array_builder.rs | 2 +- zarrs/src/array/array_errors.rs | 11 +++--- zarrs/src/array/array_representation.rs | 3 +- zarrs/src/array/codec/array_to_bytes/bytes.rs | 4 +++ zarrs/src/lib.rs | 1 + zarrs/src/plugin.rs | 3 +- zarrs_data_type/CHANGELOG.md | 17 +++++++++ zarrs_data_type/Cargo.toml | 30 ++++++++++++++++ zarrs_data_type/LICENCE-APACHE | 1 + zarrs_data_type/LICENCE-MIT | 1 + zarrs_data_type/README.md | 15 ++++++++ .../src}/data_type.rs | 4 +-- .../src}/fill_value.rs | 15 ++++++-- zarrs_data_type/src/lib.rs | 10 ++++++ zarrs_metadata/Cargo.toml | 4 +-- 22 files changed, 146 insertions(+), 40 deletions(-) create mode 100644 zarrs_data_type/CHANGELOG.md create mode 100644 zarrs_data_type/Cargo.toml 
create mode 120000 zarrs_data_type/LICENCE-APACHE create mode 120000 zarrs_data_type/LICENCE-MIT create mode 100644 zarrs_data_type/README.md rename {zarrs/src/array => zarrs_data_type/src}/data_type.rs (99%) rename {zarrs/src/array => zarrs_data_type/src}/fill_value.rs (95%) create mode 100644 zarrs_data_type/src/lib.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 1457d95d..32f0bc81 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **Breaking**: change `RawBytesOffsets` into a validated newtype - **Breaking**: `ArrayBytes::new_vlen()` not returns a `Result` and validates bytes/offsets compatibility - Reenable broken compatibility tests since fixed in `zarr-python`/`numcodecs` +- **Breaking**: move the `zarrs::array::{data_type,fill_value}` modules into the `zarrs_data_type` crate ## [0.19.1] - 2025-01-19 diff --git a/Cargo.toml b/Cargo.toml index 41825a07..00653ae8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ resolver = "2" members = [ "zarrs", + "zarrs_data_type", "zarrs_metadata", "zarrs_storage", "zarrs_filesystem", @@ -26,6 +27,10 @@ module_name_repetitions = "allow" missing_panics_doc = "warn" missing_errors_doc = "warn" +[workspace.dependencies.zarrs_data_type] +version = "0.1.0" +path = "zarrs_data_type" + [workspace.dependencies.zarrs_metadata] version = "0.3.0" path = "zarrs_metadata" @@ -62,3 +67,10 @@ version = "0.51.0" [workspace.dependencies.zip] version = "2.1.3" + +[workspace.dependencies.half] +version = "2.0.0" +features = ["bytemuck"] + +[workspace.dependencies.num] +version = "0.4.1" diff --git a/README.md b/README.md index 694750fb..31b03404 100644 --- a/README.md +++ b/README.md @@ -91,6 +91,7 @@ println!("{array_ndarray:4}"); ### Core - [`zarrs`]: The core library for manipulating Zarr hierarchies. +- [`zarrs_data_type`]: Zarr data types (re-exported as `zarrs::data_type`). 
- [`zarrs_metadata`]: Zarr metadata support (re-exported as `zarrs::metadata`). - [`zarrs_storage`]: The storage API for `zarrs` (re-exported as `zarrs::storage`). @@ -134,6 +135,7 @@ Unless you explicitly state otherwise, any contribution intentionally submitted [The `zarrs` Book]: https://book.zarrs.dev [`zarrs`]: https://github.com/LDeakin/zarrs/tree/main/zarrs +[`zarrs_data_type`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_data_type [`zarrs_metadata`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_metadata [`zarrs_storage`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_storage [`zarrs_filesystem`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_filesystem diff --git a/zarrs/Cargo.toml b/zarrs/Cargo.toml index e4cee82d..5afcb988 100644 --- a/zarrs/Cargo.toml +++ b/zarrs/Cargo.toml @@ -52,13 +52,13 @@ derive_more = { version = "1.0.0", features = ["deref", "display", "from"] } flate2 = { version = "1.0.30", optional = true } futures = { version = "0.3.29", optional = true } gdeflate-sys = { version = "0.4.1", optional = true } -half = { version = "2.0.0", features = ["bytemuck"] } +half = { workspace = true } inventory = "0.3.0" itertools = "0.14.0" lru = "0.12.4" moka = { version = "0.12.8", features = ["sync"] } ndarray = { version = ">=0.15.0,<17", optional = true } -num = { version = "0.4.1" } +num = { workspace = true } pco = { version = "0.4.0", optional = true } rayon = "1.10.0" rayon_iter_concurrent_limit = "0.2.0" @@ -68,6 +68,7 @@ thiserror = "2.0.0" thread_local = "1.1.8" unsafe_cell_slice = "0.2.0" zarrs_filesystem = { workspace = true, optional = true } +zarrs_data_type = { workspace = true } zarrs_metadata = { workspace = true } zarrs_storage = { workspace = true } zfp-sys = {version = "0.3.0", features = ["static"], optional = true } diff --git a/zarrs/doc/ecosystem.md b/zarrs/doc/ecosystem.md index b132e0f1..db528a37 100644 --- a/zarrs/doc/ecosystem.md +++ b/zarrs/doc/ecosystem.md @@ -1,5 +1,6 @@ #### Core - [`zarrs`]: The core 
library for manipulating Zarr hierarchies. +- [`zarrs_data_type`]: Zarr data types (re-exported as `zarrs::data_type`). - [`zarrs_metadata`]: Zarr metadata support (re-exported as `zarrs::metadata`). - [`zarrs_storage`]: The storage API for `zarrs` (re-exported as `zarrs::storage`). @@ -26,6 +27,7 @@ - Benchmarking tools and performance benchmarks of `zarrs`. [`zarrs`]: https://github.com/LDeakin/zarrs/tree/main/zarrs +[`zarrs_data_type`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_data_type [`zarrs_metadata`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_metadata [`zarrs_storage`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_storage [`zarrs_filesystem`]: https://github.com/LDeakin/zarrs/tree/main/zarrs_filesystem diff --git a/zarrs/doc/status/data_types.md b/zarrs/doc/status/data_types.md index af053bf6..2b4c1202 100644 --- a/zarrs/doc/status/data_types.md +++ b/zarrs/doc/status/data_types.md @@ -8,24 +8,24 @@ † Experimental data types are recommended for evaluation only. 
-[bool]: crate::array::data_type::DataType::Bool -[int8]: crate::array::data_type::DataType::Int8 -[int16]: crate::array::data_type::DataType::Int16 -[int32]: crate::array::data_type::DataType::Int32 -[int64]: crate::array::data_type::DataType::Int64 -[uint8]: crate::array::data_type::DataType::UInt8 -[uint16]: crate::array::data_type::DataType::UInt16 -[uint32]: crate::array::data_type::DataType::UInt32 -[uint64]: crate::array::data_type::DataType::UInt64 -[float16]: crate::array::data_type::DataType::Float16 -[float32]: crate::array::data_type::DataType::Float32 -[float64]: crate::array::data_type::DataType::Float64 -[complex64]: crate::array::data_type::DataType::Complex64 -[complex128]: crate::array::data_type::DataType::Complex128 -[bfloat16]: crate::array::data_type::DataType::BFloat16 -[r* (raw bits)]: crate::array::data_type::DataType::RawBits -[string]: crate::array::data_type::DataType::String -[bytes]: crate::array::data_type::DataType::Bytes +[bool]: crate::data_type::DataType::Bool +[int8]: crate::data_type::DataType::Int8 +[int16]: crate::data_type::DataType::Int16 +[int32]: crate::data_type::DataType::Int32 +[int64]: crate::data_type::DataType::Int64 +[uint8]: crate::data_type::DataType::UInt8 +[uint16]: crate::data_type::DataType::UInt16 +[uint32]: crate::data_type::DataType::UInt32 +[uint64]: crate::data_type::DataType::UInt64 +[float16]: crate::data_type::DataType::Float16 +[float32]: crate::data_type::DataType::Float32 +[float64]: crate::data_type::DataType::Float64 +[complex64]: crate::data_type::DataType::Complex64 +[complex128]: crate::data_type::DataType::Complex128 +[bfloat16]: crate::data_type::DataType::BFloat16 +[r* (raw bits)]: crate::data_type::DataType::RawBits +[string]: crate::data_type::DataType::String +[bytes]: crate::data_type::DataType::Bytes [ZEP0001]: https://zarr.dev/zeps/accepted/ZEP0001.html [zarr-specs #130]: https://github.com/zarr-developers/zarr-specs/issues/130 diff --git a/zarrs/src/array.rs b/zarrs/src/array.rs index 
ff7eb378..f0ae3f96 100644 --- a/zarrs/src/array.rs +++ b/zarrs/src/array.rs @@ -33,10 +33,9 @@ pub mod chunk_grid; pub mod chunk_key_encoding; pub mod codec; pub mod concurrency; -pub mod data_type; mod element; -mod fill_value; pub mod storage_transformer; +pub use crate::data_type; // re-export for zarrs < 0.20 compat #[cfg(feature = "sharding")] mod array_sharded_ext; @@ -65,11 +64,11 @@ pub use self::{ codec::ArrayCodecTraits, codec::CodecChain, concurrency::RecommendedConcurrency, - data_type::DataType, element::{Element, ElementFixedLength, ElementOwned}, - fill_value::FillValue, storage_transformer::StorageTransformerChain, }; +pub use crate::data_type::{DataType, FillValue}; // re-export for zarrs < 0.20 compat + pub use crate::metadata::v2::ArrayMetadataV2; use crate::metadata::v2_to_v3::ArrayMetadataV2ToV3ConversionError; pub use crate::metadata::v3::{ diff --git a/zarrs/src/array/array_builder.rs b/zarrs/src/array/array_builder.rs index 48a58b82..7ea6d8ba 100644 --- a/zarrs/src/array/array_builder.rs +++ b/zarrs/src/array/array_builder.rs @@ -1,6 +1,7 @@ use std::sync::Arc; use crate::{ + data_type::IncompatibleFillValueError, metadata::{v3::AdditionalFields, ChunkKeySeparator}, node::NodePath, }; @@ -11,7 +12,6 @@ use super::{ array_to_bytes::vlen::VlenCodec, ArrayToArrayCodecTraits, ArrayToBytesCodecTraits, BytesCodec, BytesToBytesCodecTraits, }, - data_type::IncompatibleFillValueError, Array, ArrayCreateError, ArrayMetadata, ArrayMetadataV3, ArrayShape, ChunkGrid, CodecChain, DataType, DimensionName, FillValue, StorageTransformerChain, }; diff --git a/zarrs/src/array/array_errors.rs b/zarrs/src/array/array_errors.rs index 527d0a58..272aff53 100644 --- a/zarrs/src/array/array_errors.rs +++ b/zarrs/src/array/array_errors.rs @@ -2,19 +2,16 @@ use thiserror::Error; use crate::{ array_subset::{ArraySubset, IncompatibleDimensionalityError}, + data_type::{ + IncompatibleFillValueError, IncompatibleFillValueMetadataError, UnsupportedDataTypeError, + }, 
metadata::v3::UnsupportedAdditionalFieldError, node::NodePathError, plugin::PluginCreateError, storage::StorageError, }; -use super::{ - codec::CodecError, - data_type::{ - IncompatibleFillValueError, IncompatibleFillValueMetadataError, UnsupportedDataTypeError, - }, - ArrayIndices, ArrayShape, -}; +use super::{codec::CodecError, ArrayIndices, ArrayShape}; /// An array creation error. #[derive(Debug, Error)] diff --git a/zarrs/src/array/array_representation.rs b/zarrs/src/array/array_representation.rs index 2c781fc5..5f186d50 100644 --- a/zarrs/src/array/array_representation.rs +++ b/zarrs/src/array/array_representation.rs @@ -1,6 +1,7 @@ use std::num::NonZeroU64; -use super::{data_type::IncompatibleFillValueError, ArrayShape, DataType, DataTypeSize, FillValue}; +use super::{ArrayShape, DataType, DataTypeSize, FillValue}; +use crate::data_type::IncompatibleFillValueError; use derive_more::Display; /// The shape, data type, and fill value of an `array`. diff --git a/zarrs/src/array/codec/array_to_bytes/bytes.rs b/zarrs/src/array/codec/array_to_bytes/bytes.rs index a2970240..4962dc1e 100644 --- a/zarrs/src/array/codec/array_to_bytes/bytes.rs +++ b/zarrs/src/array/codec/array_to_bytes/bytes.rs @@ -73,6 +73,10 @@ pub(crate) fn reverse_endianness(v: &mut [u8], data_type: &DataType) { } // Variable-sized data types are not supported and are rejected outside of this function DataType::String | DataType::Bytes => unreachable!(), + _ => { + // FIXME: Data type extensions, endianness reversal for custom data types + unimplemented!("Reverse endianness for data type {:?}", data_type) + } } } diff --git a/zarrs/src/lib.rs b/zarrs/src/lib.rs index 29586900..e0298e52 100644 --- a/zarrs/src/lib.rs +++ b/zarrs/src/lib.rs @@ -189,6 +189,7 @@ pub mod node; pub mod plugin; pub mod version; +pub use zarrs_data_type as data_type; pub use zarrs_metadata as metadata; pub use zarrs_storage as storage; diff --git a/zarrs/src/plugin.rs b/zarrs/src/plugin.rs index 47486045..ddcc00fe 100644 
--- a/zarrs/src/plugin.rs +++ b/zarrs/src/plugin.rs @@ -3,7 +3,8 @@ //! A [`Plugin`] creates objects from [`MetadataV3`] (consisting of a name and optional configuration). //! It is used to implement [Zarr extension points](https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#extension-points), such as [chunk grids][`crate::array::chunk_grid`], [chunk key encodings](`crate::array::chunk_key_encoding`), [codecs](`crate::array::codec`), and [storage transformers](`crate::array::storage_transformer`). //! -//! [Data types](`crate::array::data_type`) are not currently supported as an extension point. +//! [`DataType`](crate::data_type::DataType)s are not currently supported as an extension point. +// FIXME: Data type extensions //! //! Plugins are registered at compile time using the [inventory] crate. //! At runtime, a name matching function is applied to identify which registered plugin is associated with the metadata. diff --git a/zarrs_data_type/CHANGELOG.md b/zarrs_data_type/CHANGELOG.md new file mode 100644 index 00000000..91bd6bbf --- /dev/null +++ b/zarrs_data_type/CHANGELOG.md @@ -0,0 +1,17 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+ +## [Unreleased] + +## [0.1.0] - 2025-01-24 + +### Added +- Initial release +- Split from the `zarrs::array::{data_type,fill_value}` modules of `zarrs` 0.20.0-dev + +[unreleased]: https://github.com/LDeakin/zarrs/compare/zarrs_data_type-v0.1.0...HEAD +[0.1.0]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_data_type-v0.1.0 diff --git a/zarrs_data_type/Cargo.toml b/zarrs_data_type/Cargo.toml new file mode 100644 index 00000000..921a13bf --- /dev/null +++ b/zarrs_data_type/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "zarrs_data_type" +version = "0.1.0" +authors = ["Lachlan Deakin "] +edition = "2021" +rust-version = "1.77" +description = "Zarr data types for the zarrs crate" +homepage = "https://zarrs.dev" +documentation = "https://docs.rs/zarrs_data_type" +repository = "https://github.com/LDeakin/zarrs" +license = "MIT OR Apache-2.0" +keywords = ["zarr", "zarrs"] +categories = ["encoding"] + +[lints] +workspace = true + +[package.metadata.docs.rs] +all-features = true + +[dependencies] +half = { workspace = true } +num = { workspace = true } +thiserror = "2.0.0" +derive_more = { version = "1.0.0", features = ["display", "from"] } +zarrs_metadata = { workspace = true } + +[dev-dependencies] +serde_json = { version = "1.0.71", features = ["float_roundtrip", "preserve_order"] } +bytemuck = { version = "1.14.0", features = ["extern_crate_alloc", "must_cast", "min_const_generics"] } diff --git a/zarrs_data_type/LICENCE-APACHE b/zarrs_data_type/LICENCE-APACHE new file mode 120000 index 00000000..536a3dbc --- /dev/null +++ b/zarrs_data_type/LICENCE-APACHE @@ -0,0 +1 @@ +../LICENCE-APACHE \ No newline at end of file diff --git a/zarrs_data_type/LICENCE-MIT b/zarrs_data_type/LICENCE-MIT new file mode 120000 index 00000000..e259b4c0 --- /dev/null +++ b/zarrs_data_type/LICENCE-MIT @@ -0,0 +1 @@ +../LICENCE-MIT \ No newline at end of file diff --git a/zarrs_data_type/README.md b/zarrs_data_type/README.md new file mode 100644 index 00000000..abe03ad1 --- /dev/null +++ 
b/zarrs_data_type/README.md @@ -0,0 +1,15 @@ +# zarrs_data_type + +[![Latest Version](https://img.shields.io/crates/v/zarrs_data_type.svg)](https://crates.io/crates/zarrs_data_type) +[![zarrs_data_type documentation](https://docs.rs/zarrs_data_type/badge.svg)](https://docs.rs/zarrs_data_type) +![msrv](https://img.shields.io/crates/msrv/zarrs_data_type) +[![build](https://github.com/LDeakin/zarrs/actions/workflows/ci.yml/badge.svg)](https://github.com/LDeakin/zarrs/actions/workflows/ci.yml) + +[Zarr](https://zarr-specs.readthedocs.io/) data types for the [`zarrs`](https://crates.io/crates/zarrs) Rust crate. + +## Licence +`zarrs_data_type` is licensed under either of + - the Apache License, Version 2.0 [LICENSE-APACHE](./LICENCE-APACHE) or <http://www.apache.org/licenses/LICENSE-2.0> or + - the MIT license [LICENSE-MIT](./LICENCE-MIT) or <http://opensource.org/licenses/MIT>, at your option. + +Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any additional terms or conditions. 
diff --git a/zarrs/src/array/data_type.rs b/zarrs_data_type/src/data_type.rs similarity index 99% rename from zarrs/src/array/data_type.rs rename to zarrs_data_type/src/data_type.rs index 0ca4300e..1d0d64da 100644 --- a/zarrs/src/array/data_type.rs +++ b/zarrs_data_type/src/data_type.rs @@ -6,7 +6,7 @@ use derive_more::From; use half::{bf16, f16}; use thiserror::Error; -use crate::metadata::v3::array::{ +use zarrs_metadata::v3::array::{ data_type::{DataTypeMetadataV3, DataTypeSize}, fill_value::{ bfloat16_to_fill_value, float16_to_fill_value, float32_to_fill_value, @@ -356,7 +356,7 @@ impl core::fmt::Display for DataType { mod tests { use super::*; - use crate::metadata::v3::array::{ + use zarrs_metadata::v3::array::{ fill_value::{FillValueFloatStringNonFinite, HexString}, nan_representations::{ZARR_NAN_BF16, ZARR_NAN_F16, ZARR_NAN_F32, ZARR_NAN_F64}, }; diff --git a/zarrs/src/array/fill_value.rs b/zarrs_data_type/src/fill_value.rs similarity index 95% rename from zarrs/src/array/fill_value.rs rename to zarrs_data_type/src/fill_value.rs index 98560702..fa4609a9 100644 --- a/zarrs/src/array/fill_value.rs +++ b/zarrs_data_type/src/fill_value.rs @@ -233,10 +233,21 @@ impl FillValue { #[cfg(test)] mod tests { - use crate::array::transmute_to_bytes_vec; - use super::*; + /// Convert from `&[T]` to `Vec`. + #[must_use] + fn convert_to_bytes_vec(from: &[T]) -> Vec { + bytemuck::allocation::pod_collect_to_vec(from) + } + + /// Transmute from `Vec` to `Vec`. + #[must_use] + fn transmute_to_bytes_vec(from: Vec) -> Vec { + bytemuck::allocation::try_cast_vec(from) + .unwrap_or_else(|(_err, from)| convert_to_bytes_vec(&from)) + } + #[test] fn fill_value() { assert_eq!( diff --git a/zarrs_data_type/src/lib.rs b/zarrs_data_type/src/lib.rs new file mode 100644 index 00000000..b9a4eeaf --- /dev/null +++ b/zarrs_data_type/src/lib.rs @@ -0,0 +1,10 @@ +//! [Zarr](https://zarr-specs.readthedocs.io/) data types for the [`zarrs`](https://docs.rs/zarrs/latest/zarrs/index.html) crate. 
+ +mod data_type; +mod fill_value; + +pub use data_type::{ + DataType, IncompatibleFillValueError, IncompatibleFillValueMetadataError, + UnsupportedDataTypeError, +}; +pub use fill_value::FillValue; diff --git a/zarrs_metadata/Cargo.toml b/zarrs_metadata/Cargo.toml index 940579cf..be95bac4 100644 --- a/zarrs_metadata/Cargo.toml +++ b/zarrs_metadata/Cargo.toml @@ -17,9 +17,9 @@ workspace = true [dependencies] derive_more = { version = "1.0.0", features = ["display", "from"] } -half = { version = "2.0.0", features = ["bytemuck"] } +half = { workspace = true } monostate = "0.1.0" -num = { version = "0.4.1" } +num = { workspace = true } serde = { version = "1.0.185", features = ["derive"] } serde_json = { version = "1.0.71", features = ["float_roundtrip", "preserve_order"] } serde_repr = "0.1.19" From 4bda6fd96c90f6a8b594efcd43148a9328cc6218 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Wed, 29 Jan 2025 11:35:02 +1100 Subject: [PATCH 28/45] fix(metadata): Interpret a `null` fill value as `""` for Zarr V2 string arrays --- zarrs_metadata/CHANGELOG.md | 3 +++ zarrs_metadata/src/v2_to_v3.rs | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/zarrs_metadata/CHANGELOG.md b/zarrs_metadata/CHANGELOG.md index a75d96ac..5ff25222 100644 --- a/zarrs_metadata/CHANGELOG.md +++ b/zarrs_metadata/CHANGELOG.md @@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed +- Interpret a `null` fill value as `""` for Zarr V2 string arrays (for `zarr-python` compatibility) + ## [0.3.0] - 2025-01-10 ### Added diff --git a/zarrs_metadata/src/v2_to_v3.rs b/zarrs_metadata/src/v2_to_v3.rs index a4de5c8a..b922b91a 100644 --- a/zarrs_metadata/src/v2_to_v3.rs +++ b/zarrs_metadata/src/v2_to_v3.rs @@ -96,6 +96,13 @@ pub fn array_metadata_v2_to_v3( // Fill value let mut fill_value = array_metadata_fill_value_v2_to_v3(&array_metadata_v2.fill_value) + .or_else(|| { + // Support zarr-python encoded string arrays with a `null` 
fill value + match data_type.name().as_str() { + "string" => Some(FillValueMetadataV3::String(String::new())), + _ => None, + } + }) .ok_or_else(|| { // TODO: How best to deal with null fill values? What do other implementations do? ArrayMetadataV2ToV3ConversionError::UnsupportedFillValue( From 1dbffd114f5339e6018e6480471b8887498c9fdc Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Wed, 29 Jan 2025 11:36:50 +1100 Subject: [PATCH 29/45] chore: use new `zarr-python` for string compatibility tests --- zarrs/tests/data/v2_cities.py | 25 ++++++++++--- zarrs/tests/data/v3_cities.py | 32 +++++++++++++---- .../zarr_python_compat/cities_v2.zarr/.zarray | 33 +++++++++--------- .../zarr_python_compat/cities_v2.zarr/.zattrs | 1 + .../zarr_python_compat/cities_v3.zarr/c/47 | Bin 11630 -> 11498 bytes .../cities_v3.zarr/zarr.json | 32 ++++++++++++++++- 6 files changed, 96 insertions(+), 27 deletions(-) create mode 100644 zarrs/tests/data/zarr_python_compat/cities_v2.zarr/.zattrs diff --git a/zarrs/tests/data/v2_cities.py b/zarrs/tests/data/v2_cities.py index b75a2d80..1b330581 100644 --- a/zarrs/tests/data/v2_cities.py +++ b/zarrs/tests/data/v2_cities.py @@ -1,12 +1,29 @@ +#!/usr/bin/env -S uv run +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "zarr==3.0.1", +# "pandas==2.2.3" +# ] +# /// + import zarr import pandas as pd -print(zarr.__version__) # This was generated with zarr==2.18 - df = pd.read_csv("tests/data/cities.csv", header=None) cities = df[0] -path_out = 'tests/data/zarr_python_compat/cities_v2.zarr' -array = zarr.open(path_out, mode='w', dtype=str, shape=(len(cities),), chunks=(1000,), compressor = None, fill_value='') +path_out = "tests/data/zarr_python_compat/cities_v2.zarr" +array = zarr.create_array( + path_out, + dtype=str, + shape=(len(cities),), + chunks=(1000,), + filters=zarr.codecs.vlen_utf8.VLenUTF8(), + compressors=[None], + # fill_value="", + zarr_format=2, + overwrite=True, +) array[:] = cities.values print(array.info) diff --git 
a/zarrs/tests/data/v3_cities.py b/zarrs/tests/data/v3_cities.py index 93fa6bed..ecca4a14 100644 --- a/zarrs/tests/data/v3_cities.py +++ b/zarrs/tests/data/v3_cities.py @@ -1,19 +1,39 @@ +#!/usr/bin/env -S uv run +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "zarr==3.0.1", +# "pandas==2.2.3" +# ] +# /// + import zarr import pandas as pd -print(zarr.__version__) # This was generate with zarr==2.18 - df = pd.read_csv("tests/data/cities.csv", header=None) cities = df[0] -path_out = 'tests/data/zarr_python_compat/cities_v3.zarr' -array = zarr.open(path_out, mode='w', dtype=str, shape=(len(cities),), chunks=(1000,)) +path_out = "tests/data/zarr_python_compat/cities_v3.zarr" +array = zarr.create_array( + path_out, + dtype=str, + shape=(len(cities),), + chunks=(1000,), + compressors=[], + zarr_format=3, + overwrite=True, +) array[:] = cities.values print(array.info) -array_v2 = zarr.open('tests/data/zarr_python_compat/cities_v2.zarr', dtype=str, shape=(len(cities),), chunks=(1000,)) +array_v2 = zarr.open( + "tests/data/zarr_python_compat/cities_v2.zarr", + dtype=str, + shape=(len(cities),), + chunks=(1000,), +) -assert((array[:] == array_v2[:]).all()) +assert (array[:] == array_v2[:]).all() # for i in range(48): # v2 = open(f'tests/data/v2/cities.zarr/{i}', 'rb').read() diff --git a/zarrs/tests/data/zarr_python_compat/cities_v2.zarr/.zarray b/zarrs/tests/data/zarr_python_compat/cities_v2.zarr/.zarray index f459a409..1337f1cb 100644 --- a/zarrs/tests/data/zarr_python_compat/cities_v2.zarr/.zarray +++ b/zarrs/tests/data/zarr_python_compat/cities_v2.zarr/.zarray @@ -1,18 +1,19 @@ { - "chunks": [ - 1000 - ], - "compressor": null, - "dtype": "|O", - "fill_value": "", - "filters": [ - { - "id": "vlen-utf8" - } - ], - "order": "C", - "shape": [ - 47868 - ], - "zarr_format": 2 + "shape": [ + 47868 + ], + "chunks": [ + 1000 + ], + "fill_value": null, + "order": "C", + "filters": [ + { + "id": "vlen-utf8" + } + ], + "dimension_separator": ".", + "compressor": 
null, + "zarr_format": 2, + "dtype": "|O" } \ No newline at end of file diff --git a/zarrs/tests/data/zarr_python_compat/cities_v2.zarr/.zattrs b/zarrs/tests/data/zarr_python_compat/cities_v2.zarr/.zattrs new file mode 100644 index 00000000..9e26dfee --- /dev/null +++ b/zarrs/tests/data/zarr_python_compat/cities_v2.zarr/.zattrs @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/zarrs/tests/data/zarr_python_compat/cities_v3.zarr/c/47 b/zarrs/tests/data/zarr_python_compat/cities_v3.zarr/c/47 index 4ee64533ef53f561d849aa1e182128533a8bd2f6..09682c9c951d73f07f8e896a6dc413ba8b19b3a3 100644 GIT binary patch delta 540 bcmaDC^(u11EvA=4`lE&)DNib3 Date: Wed, 29 Jan 2025 11:40:34 +1100 Subject: [PATCH 30/45] zarrs_metadata: prepare 0.3.1 release --- zarrs_metadata/CHANGELOG.md | 5 ++++- zarrs_metadata/Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/zarrs_metadata/CHANGELOG.md b/zarrs_metadata/CHANGELOG.md index 5ff25222..81f3923f 100644 --- a/zarrs_metadata/CHANGELOG.md +++ b/zarrs_metadata/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.1] - 2025-01-29 + ### Fixed - Interpret a `null` fill value as `""` for Zarr V2 string arrays (for `zarr-python` compatibility) @@ -59,7 +61,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Initial release - Split from the `metadata` module of `zarrs` 0.17.0-dev -[unreleased]: https://github.com/LDeakin/zarrs/compare/zarrs_metadata-v0.3.0...HEAD +[unreleased]: https://github.com/LDeakin/zarrs/compare/zarrs_metadata-v0.3.1...HEAD +[0.3.1]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_metadata-v0.3.1 [0.3.0]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_metadata-v0.3.0 [0.2.0]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_metadata-v0.2.0 [0.1.0]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_metadata-v0.1.0 diff --git a/zarrs_metadata/Cargo.toml 
b/zarrs_metadata/Cargo.toml index be95bac4..c1ef9f65 100644 --- a/zarrs_metadata/Cargo.toml +++ b/zarrs_metadata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "zarrs_metadata" -version = "0.3.0" +version = "0.3.1" authors = ["Lachlan Deakin "] edition = "2021" rust-version = "1.77" From 0532fe983b7b42b59dbf84e50a2fe5e6f7bad4ce Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Sat, 1 Feb 2025 11:57:40 +1100 Subject: [PATCH 31/45] chore(deps): bump `lru` to 0.13 --- CHANGELOG.md | 1 + zarrs/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 32f0bc81..3875821f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **Breaking**: `ArrayBytes::new_vlen()` not returns a `Result` and validates bytes/offsets compatibility - Reenable broken compatibility tests since fixed in `zarr-python`/`numcodecs` - **Breaking**: move the `zarrs::array::{data_type,fill_value}` modules into the `zarrs_data_type` crate +- Bump `lru` to 0.13 ## [0.19.1] - 2025-01-19 diff --git a/zarrs/Cargo.toml b/zarrs/Cargo.toml index 5afcb988..881b5493 100644 --- a/zarrs/Cargo.toml +++ b/zarrs/Cargo.toml @@ -55,7 +55,7 @@ gdeflate-sys = { version = "0.4.1", optional = true } half = { workspace = true } inventory = "0.3.0" itertools = "0.14.0" -lru = "0.12.4" +lru = "0.13.0" moka = { version = "0.12.8", features = ["sync"] } ndarray = { version = ">=0.15.0,<17", optional = true } num = { workspace = true } From 9070e12ea06c297532347af3668be9927ba35fa1 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Tue, 4 Feb 2025 07:16:38 +1100 Subject: [PATCH 32/45] fix: docs gen in `vlen_v2_codec` macro --- .../src/array/codec/array_to_bytes/vlen_v2/vlen_v2_macros.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_macros.rs b/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_macros.rs 
index 40a887be..8a5f35b2 100644 --- a/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_macros.rs +++ b/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_macros.rs @@ -55,14 +55,14 @@ macro_rules! vlen_v2_codec { #[cfg(feature = "async")] use crate::array::codec::{AsyncArrayPartialDecoderTraits, AsyncBytesPartialDecoderTraits}; - /// The `$identifier` codec implementation. + #[doc = concat!("The `", $identifier, "` codec implementation.")] #[derive(Debug, Clone)] pub struct $struct { inner: Arc, } impl $struct { - /// Create a new `$identifier` codec. + #[doc = concat!("Create a new `", $identifier, "` codec.")] #[must_use] pub fn new() -> Self { Self { From 20089c559484384fc30c229d08731715aefe1aba Mon Sep 17 00:00:00 2001 From: Zhuoqing Fang Date: Mon, 3 Feb 2025 14:56:45 -0800 Subject: [PATCH 33/45] handle fill_value datatype for string-array (#140) * handle fill_value datatype for string-array * fix 0 -> '' * fix: minimise "string"/0 fill value workaround * chore: add V2 string fill value tests --------- Co-authored-by: Zhuoqing Fang Co-authored-by: Lachlan Deakin --- zarrs/tests/data/v2_str0.py | 42 ++++++++++++++++++ .../str_v2_fv_0.zarr/.zarray | 19 ++++++++ .../str_v2_fv_0.zarr/.zattrs | 1 + .../zarr_python_compat/str_v2_fv_0.zarr/0 | Bin 0 -> 15 bytes .../zarr_python_compat/str_v2_fv_0.zarr/1 | Bin 0 -> 12 bytes .../str_v2_fv_null.zarr/.zarray | 19 ++++++++ .../str_v2_fv_null.zarr/.zattrs | 1 + .../zarr_python_compat/str_v2_fv_null.zarr/0 | Bin 0 -> 15 bytes .../zarr_python_compat/str_v2_fv_null.zarr/1 | Bin 0 -> 12 bytes zarrs/tests/zarr_python_compat.rs | 28 ++++++++++++ zarrs_metadata/src/v2_to_v3.rs | 5 +++ 11 files changed, 115 insertions(+) create mode 100755 zarrs/tests/data/v2_str0.py create mode 100644 zarrs/tests/data/zarr_python_compat/str_v2_fv_0.zarr/.zarray create mode 100644 zarrs/tests/data/zarr_python_compat/str_v2_fv_0.zarr/.zattrs create mode 100644 zarrs/tests/data/zarr_python_compat/str_v2_fv_0.zarr/0 create mode 100644 
zarrs/tests/data/zarr_python_compat/str_v2_fv_0.zarr/1 create mode 100644 zarrs/tests/data/zarr_python_compat/str_v2_fv_null.zarr/.zarray create mode 100644 zarrs/tests/data/zarr_python_compat/str_v2_fv_null.zarr/.zattrs create mode 100644 zarrs/tests/data/zarr_python_compat/str_v2_fv_null.zarr/0 create mode 100644 zarrs/tests/data/zarr_python_compat/str_v2_fv_null.zarr/1 diff --git a/zarrs/tests/data/v2_str0.py b/zarrs/tests/data/v2_str0.py new file mode 100755 index 00000000..162724ee --- /dev/null +++ b/zarrs/tests/data/v2_str0.py @@ -0,0 +1,42 @@ +#!/usr/bin/env -S uv run +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "zarr==3.0.1", +# ] +# /// + +import zarr + +path_out = "tests/data/zarr_python_compat/str_v2_fv_0.zarr" +array = zarr.create_array( + path_out, + dtype=str, + shape=(5,), + chunks=(2,), + filters=zarr.codecs.vlen_utf8.VLenUTF8(), + compressors=[None], + fill_value=0, + zarr_format=2, + overwrite=True, +) +array[:3] = ["a", "bb", ""] +print(array.info) +# assert (array[:] == ["a", "bb", "", "", ""]).all() # FAILURE + +path_out = "tests/data/zarr_python_compat/str_v2_fv_null.zarr" +array = zarr.create_array( + path_out, + dtype=str, + shape=(5,), + chunks=(2,), + filters=zarr.codecs.vlen_utf8.VLenUTF8(), + compressors=[None], + fill_value=None, + zarr_format=2, + overwrite=True, +) +array[:3] = ["a", "bb", ""] +print(array.info) +print(array[:]) +assert (array[:] == ["a", "bb", "", "", ""]).all() \ No newline at end of file diff --git a/zarrs/tests/data/zarr_python_compat/str_v2_fv_0.zarr/.zarray b/zarrs/tests/data/zarr_python_compat/str_v2_fv_0.zarr/.zarray new file mode 100644 index 00000000..aae1beed --- /dev/null +++ b/zarrs/tests/data/zarr_python_compat/str_v2_fv_0.zarr/.zarray @@ -0,0 +1,19 @@ +{ + "shape": [ + 5 + ], + "chunks": [ + 2 + ], + "fill_value": 0, + "order": "C", + "filters": [ + { + "id": "vlen-utf8" + } + ], + "dimension_separator": ".", + "compressor": null, + "zarr_format": 2, + "dtype": "|O" +} \ No 
newline at end of file diff --git a/zarrs/tests/data/zarr_python_compat/str_v2_fv_0.zarr/.zattrs b/zarrs/tests/data/zarr_python_compat/str_v2_fv_0.zarr/.zattrs new file mode 100644 index 00000000..9e26dfee --- /dev/null +++ b/zarrs/tests/data/zarr_python_compat/str_v2_fv_0.zarr/.zattrs @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/zarrs/tests/data/zarr_python_compat/str_v2_fv_0.zarr/0 b/zarrs/tests/data/zarr_python_compat/str_v2_fv_0.zarr/0 new file mode 100644 index 0000000000000000000000000000000000000000..72190f219c3972acc03de5b5e7c9c5d1dc09e2d2 GIT binary patch literal 15 TcmZQ#U|?Vb;zTAOk(2}g1Q7u% literal 0 HcmV?d00001 diff --git a/zarrs/tests/data/zarr_python_compat/str_v2_fv_0.zarr/1 b/zarrs/tests/data/zarr_python_compat/str_v2_fv_0.zarr/1 new file mode 100644 index 0000000000000000000000000000000000000000..3ae168742fb512b29aeaf6283b3f2534804fb9be GIT binary patch literal 12 KcmZQ#KmY&$Bme^d literal 0 HcmV?d00001 diff --git a/zarrs/tests/data/zarr_python_compat/str_v2_fv_null.zarr/.zarray b/zarrs/tests/data/zarr_python_compat/str_v2_fv_null.zarr/.zarray new file mode 100644 index 00000000..a1b39c04 --- /dev/null +++ b/zarrs/tests/data/zarr_python_compat/str_v2_fv_null.zarr/.zarray @@ -0,0 +1,19 @@ +{ + "shape": [ + 5 + ], + "chunks": [ + 2 + ], + "fill_value": null, + "order": "C", + "filters": [ + { + "id": "vlen-utf8" + } + ], + "dimension_separator": ".", + "compressor": null, + "zarr_format": 2, + "dtype": "|O" +} \ No newline at end of file diff --git a/zarrs/tests/data/zarr_python_compat/str_v2_fv_null.zarr/.zattrs b/zarrs/tests/data/zarr_python_compat/str_v2_fv_null.zarr/.zattrs new file mode 100644 index 00000000..9e26dfee --- /dev/null +++ b/zarrs/tests/data/zarr_python_compat/str_v2_fv_null.zarr/.zattrs @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/zarrs/tests/data/zarr_python_compat/str_v2_fv_null.zarr/0 b/zarrs/tests/data/zarr_python_compat/str_v2_fv_null.zarr/0 new file mode 100644 index 
0000000000000000000000000000000000000000..72190f219c3972acc03de5b5e7c9c5d1dc09e2d2 GIT binary patch literal 15 TcmZQ#U|?Vb;zTAOk(2}g1Q7u% literal 0 HcmV?d00001 diff --git a/zarrs/tests/data/zarr_python_compat/str_v2_fv_null.zarr/1 b/zarrs/tests/data/zarr_python_compat/str_v2_fv_null.zarr/1 new file mode 100644 index 0000000000000000000000000000000000000000..3ae168742fb512b29aeaf6283b3f2534804fb9be GIT binary patch literal 12 KcmZQ#KmY&$Bme^d literal 0 HcmV?d00001 diff --git a/zarrs/tests/zarr_python_compat.rs b/zarrs/tests/zarr_python_compat.rs index 54b4d195..7f1974ca 100644 --- a/zarrs/tests/zarr_python_compat.rs +++ b/zarrs/tests/zarr_python_compat.rs @@ -43,3 +43,31 @@ fn zarr_python_compat_fletcher32_v2() -> Result<(), Box> { Ok(()) } + +#[test] +fn zarr_python_v2_compat_str_fv_0() -> Result<(), Box> { + let store = Arc::new(FilesystemStore::new( + "tests/data/zarr_python_compat/str_v2_fv_0.zarr", + )?); + let array = zarrs::array::Array::open(store.clone(), "/")?; + let subset_all = array.subset_all(); + let elements = array.retrieve_array_subset_elements::(&subset_all)?; + + assert_eq!(elements, &["a", "bb", "", "", ""]); + + Ok(()) +} + +#[test] +fn zarr_python_v2_compat_str_fv_null() -> Result<(), Box> { + let store = Arc::new(FilesystemStore::new( + "tests/data/zarr_python_compat/str_v2_fv_null.zarr", + )?); + let array = zarrs::array::Array::open(store.clone(), "/")?; + let subset_all = array.subset_all(); + let elements = array.retrieve_array_subset_elements::(&subset_all)?; + + assert_eq!(elements, &["a", "bb", "", "", ""]); + + Ok(()) +} diff --git a/zarrs_metadata/src/v2_to_v3.rs b/zarrs_metadata/src/v2_to_v3.rs index b922b91a..7c785a23 100644 --- a/zarrs_metadata/src/v2_to_v3.rs +++ b/zarrs_metadata/src/v2_to_v3.rs @@ -124,6 +124,11 @@ pub fn array_metadata_v2_to_v3( )); } } + } else if data_type.name() == "string" { + // Add a special case for `zarr-python` string data with a 0 fill value -> empty string + if let Some(0) = 
fill_value.try_as_uint::() { + fill_value = FillValueMetadataV3::String(String::new()); + } } let mut codecs: Vec = vec![]; From 7dfb22f52f1083f118473be782b3de945be49fde Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Tue, 4 Feb 2025 10:00:09 +1100 Subject: [PATCH 34/45] chore: #140 changelog --- zarrs_metadata/CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/zarrs_metadata/CHANGELOG.md b/zarrs_metadata/CHANGELOG.md index 81f3923f..546c47e6 100644 --- a/zarrs_metadata/CHANGELOG.md +++ b/zarrs_metadata/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed +- Interpret a `0` fill value as `""` for Zarr V2 string arrays (for `zarr-python` compatibility) ([#140] by [@zqfang]) + +[#140]: https://github.com/LDeakin/zarrs/pull/140 + ## [0.3.1] - 2025-01-29 ### Fixed @@ -66,3 +71,5 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 [0.3.0]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_metadata-v0.3.0 [0.2.0]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_metadata-v0.2.0 [0.1.0]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_metadata-v0.1.0 + +[@zqfang]: https://github.com/zqfang From 209b64a5d29a90af1d87518e7f47e5889ffb5d09 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Tue, 4 Feb 2025 10:07:08 +1100 Subject: [PATCH 35/45] feat(metadata): add `codec_metadata_v2_to_v3` (#141) --- zarrs_metadata/CHANGELOG.md | 4 + zarrs_metadata/src/v2/array.rs | 2 +- zarrs_metadata/src/v2_to_v3.rs | 173 +++++++++++++++++++-------------- 3 files changed, 104 insertions(+), 75 deletions(-) diff --git a/zarrs_metadata/CHANGELOG.md b/zarrs_metadata/CHANGELOG.md index 546c47e6..34aa2546 100644 --- a/zarrs_metadata/CHANGELOG.md +++ b/zarrs_metadata/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added +- Derive `Copy` for `ArrayMetadataV2Order` +- Add 
`codec_metadata_v2_to_v3` + ### Fixed - Interpret a `0` fill value as `""` for Zarr V2 string arrays (for `zarr-python` compatibility) ([#140] by [@zqfang]) diff --git a/zarrs_metadata/src/v2/array.rs b/zarrs_metadata/src/v2/array.rs index aedebbfa..c679e5fb 100644 --- a/zarrs_metadata/src/v2/array.rs +++ b/zarrs_metadata/src/v2/array.rs @@ -296,7 +296,7 @@ impl Serialize for FillValueMetadataV2 { } /// The layout of bytes within each chunk of the array. -#[derive(Serialize, Deserialize, Clone, PartialEq, Eq, Debug)] +#[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq, Debug)] pub enum ArrayMetadataV2Order { /// Row-major order. The last dimension varies fastest. C, diff --git a/zarrs_metadata/src/v2_to_v3.rs b/zarrs_metadata/src/v2_to_v3.rs index 7c785a23..6dc38ff2 100644 --- a/zarrs_metadata/src/v2_to_v3.rs +++ b/zarrs_metadata/src/v2_to_v3.rs @@ -11,7 +11,7 @@ use crate::{ data_type_metadata_v2_to_endianness, ArrayMetadataV2Order, DataTypeMetadataV2, DataTypeMetadataV2InvalidEndiannessError, FillValueMetadataV2, }, - ArrayMetadataV2, GroupMetadataV2, + ArrayMetadataV2, GroupMetadataV2, MetadataV2, }, v3::{ array::{ @@ -25,6 +25,7 @@ use crate::{ }, ArrayMetadataV3, GroupMetadataV3, MetadataV3, }, + Endianness, }; use super::v3::array::data_type::DataTypeMetadataV3; @@ -61,85 +62,27 @@ pub enum ArrayMetadataV2ToV3ConversionError { Other(String), } -/// Convert Zarr V2 array metadata to V3. +/// Convert Zarr V2 codec metadata to the equivalent Zarr V3 codec metadata. /// /// # Errors /// Returns a [`ArrayMetadataV2ToV3ConversionError`] if the metadata is invalid or is not compatible with Zarr V3 metadata. 
-#[allow(clippy::too_many_lines)] -pub fn array_metadata_v2_to_v3( - array_metadata_v2: &ArrayMetadataV2, -) -> Result { - let shape = array_metadata_v2.shape.clone(); - let chunk_grid = MetadataV3::new_with_serializable_configuration( - crate::v3::array::chunk_grid::regular::IDENTIFIER, - &RegularChunkGridConfiguration { - chunk_shape: array_metadata_v2.chunks.clone(), - }, - )?; - - let (Ok(data_type), endianness) = ( - data_type_metadata_v2_to_v3_data_type(&array_metadata_v2.dtype), - data_type_metadata_v2_to_endianness(&array_metadata_v2.dtype) - .map_err(ArrayMetadataV2ToV3ConversionError::InvalidEndianness)?, - ) else { - return Err(ArrayMetadataV2ToV3ConversionError::UnsupportedDataType( - match &array_metadata_v2.dtype { - DataTypeMetadataV2::Simple(dtype) => dtype.clone(), - DataTypeMetadataV2::Structured(dtype) => { - return Err(ArrayMetadataV2ToV3ConversionError::UnsupportedDataType( - format!("{dtype:?}"), - )) - } - }, - )); - }; - - // Fill value - let mut fill_value = array_metadata_fill_value_v2_to_v3(&array_metadata_v2.fill_value) - .or_else(|| { - // Support zarr-python encoded string arrays with a `null` fill value - match data_type.name().as_str() { - "string" => Some(FillValueMetadataV3::String(String::new())), - _ => None, - } - }) - .ok_or_else(|| { - // TODO: How best to deal with null fill values? What do other implementations do? 
- ArrayMetadataV2ToV3ConversionError::UnsupportedFillValue( - data_type.to_string(), - array_metadata_v2.fill_value.clone(), - ) - })?; - if data_type.name() == "bool" { - // Map a 0/1 scalar fill value to a bool - if let Some(fill_value_uint) = fill_value.try_as_uint::() { - if fill_value_uint == 0 { - fill_value = FillValueMetadataV3::Bool(false); - } else if fill_value_uint == 1 { - fill_value = FillValueMetadataV3::Bool(true); - } else { - return Err(ArrayMetadataV2ToV3ConversionError::UnsupportedFillValue( - data_type.to_string(), - array_metadata_v2.fill_value.clone(), - )); - } - } - } else if data_type.name() == "string" { - // Add a special case for `zarr-python` string data with a 0 fill value -> empty string - if let Some(0) = fill_value.try_as_uint::() { - fill_value = FillValueMetadataV3::String(String::new()); - } - } - +pub fn codec_metadata_v2_to_v3( + order: ArrayMetadataV2Order, + dimensionality: usize, + data_type: &DataTypeMetadataV3, + endianness: Option, + filters: &Option>, + compressor: &Option, +) -> Result, ArrayMetadataV2ToV3ConversionError> { let mut codecs: Vec = vec![]; // Array-to-array codecs - if array_metadata_v2.order == ArrayMetadataV2Order::F { + if order == ArrayMetadataV2Order::F { let transpose_metadata = MetadataV3::new_with_serializable_configuration( crate::v3::array::codec::transpose::IDENTIFIER, &TransposeCodecConfigurationV1 { order: { - let f_order: Vec = (0..array_metadata_v2.shape.len()).rev().collect(); + let f_order: Vec = (0..dimensionality).rev().collect(); unsafe { // SAFETY: f_order is valid TransposeOrder::new(&f_order).unwrap_unchecked() @@ -152,7 +95,7 @@ pub fn array_metadata_v2_to_v3( // Filters (array to array or array to bytes codecs) let mut has_array_to_bytes = false; - if let Some(filters) = &array_metadata_v2.filters { + if let Some(filters) = filters { for filter in filters { // TODO: Add a V2 registry with V2 to V3 conversion functions match filter.id() { @@ -175,7 +118,7 @@ pub fn 
array_metadata_v2_to_v3( } // Compressor (array to bytes codec) - if let Some(compressor) = &array_metadata_v2.compressor { + if let Some(compressor) = compressor { #[allow(clippy::single_match)] match compressor.id() { crate::v2::array::codec::zfpy::IDENTIFIER => { @@ -211,7 +154,7 @@ pub fn array_metadata_v2_to_v3( } // Compressor (bytes to bytes codec) - if let Some(compressor) = &array_metadata_v2.compressor { + if let Some(compressor) = compressor { match compressor.id() { crate::v2::array::codec::zfpy::IDENTIFIER | crate::v3::array::codec::pcodec::IDENTIFIER => { @@ -221,7 +164,7 @@ pub fn array_metadata_v2_to_v3( let blosc = serde_json::from_value::( serde_json::to_value(compressor.configuration())?, )?; - let configuration = codec_blosc_v2_numcodecs_to_v3(&blosc, &data_type); + let configuration = codec_blosc_v2_numcodecs_to_v3(&blosc, data_type); codecs.push(MetadataV3::new_with_serializable_configuration( crate::v3::array::codec::blosc::IDENTIFIER, &configuration, @@ -244,6 +187,88 @@ pub fn array_metadata_v2_to_v3( }; } + Ok(codecs) +} + +/// Convert Zarr V2 array metadata to V3. +/// +/// # Errors +/// Returns a [`ArrayMetadataV2ToV3ConversionError`] if the metadata is invalid or is not compatible with Zarr V3 metadata. 
+#[allow(clippy::too_many_lines)] +pub fn array_metadata_v2_to_v3( + array_metadata_v2: &ArrayMetadataV2, +) -> Result { + let shape = array_metadata_v2.shape.clone(); + let chunk_grid = MetadataV3::new_with_serializable_configuration( + crate::v3::array::chunk_grid::regular::IDENTIFIER, + &RegularChunkGridConfiguration { + chunk_shape: array_metadata_v2.chunks.clone(), + }, + )?; + + let (Ok(data_type), endianness) = ( + data_type_metadata_v2_to_v3_data_type(&array_metadata_v2.dtype), + data_type_metadata_v2_to_endianness(&array_metadata_v2.dtype) + .map_err(ArrayMetadataV2ToV3ConversionError::InvalidEndianness)?, + ) else { + return Err(ArrayMetadataV2ToV3ConversionError::UnsupportedDataType( + match &array_metadata_v2.dtype { + DataTypeMetadataV2::Simple(dtype) => dtype.clone(), + DataTypeMetadataV2::Structured(dtype) => { + return Err(ArrayMetadataV2ToV3ConversionError::UnsupportedDataType( + format!("{dtype:?}"), + )) + } + }, + )); + }; + + // Fill value + let mut fill_value = array_metadata_fill_value_v2_to_v3(&array_metadata_v2.fill_value) + .or_else(|| { + // Support zarr-python encoded string arrays with a `null` fill value + match data_type.name().as_str() { + "string" => Some(FillValueMetadataV3::String(String::new())), + _ => None, + } + }) + .ok_or_else(|| { + // TODO: How best to deal with null fill values? What do other implementations do? 
+ ArrayMetadataV2ToV3ConversionError::UnsupportedFillValue( + data_type.to_string(), + array_metadata_v2.fill_value.clone(), + ) + })?; + if data_type.name() == "bool" { + // Map a 0/1 scalar fill value to a bool + if let Some(fill_value_uint) = fill_value.try_as_uint::() { + if fill_value_uint == 0 { + fill_value = FillValueMetadataV3::Bool(false); + } else if fill_value_uint == 1 { + fill_value = FillValueMetadataV3::Bool(true); + } else { + return Err(ArrayMetadataV2ToV3ConversionError::UnsupportedFillValue( + data_type.to_string(), + array_metadata_v2.fill_value.clone(), + )); + } + } + } else if data_type.name() == "string" { + // Add a special case for `zarr-python` string data with a 0 fill value -> empty string + if let Some(0) = fill_value.try_as_uint::() { + fill_value = FillValueMetadataV3::String(String::new()); + } + } + + let codecs = codec_metadata_v2_to_v3( + array_metadata_v2.order, + array_metadata_v2.shape.len(), + &data_type, + endianness, + &array_metadata_v2.filters, + &array_metadata_v2.compressor, + )?; + let chunk_key_encoding = MetadataV3::new_with_serializable_configuration( crate::v3::array::chunk_key_encoding::v2::IDENTIFIER, &V2ChunkKeyEncodingConfiguration { From fd10d5629c781f9fe01c544aeed42a7ea0e11c98 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Tue, 4 Feb 2025 10:08:28 +1100 Subject: [PATCH 36/45] zarrs_metadata: prepare 0.3.2 release --- zarrs_metadata/CHANGELOG.md | 5 ++++- zarrs_metadata/Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/zarrs_metadata/CHANGELOG.md b/zarrs_metadata/CHANGELOG.md index 34aa2546..b1ab7380 100644 --- a/zarrs_metadata/CHANGELOG.md +++ b/zarrs_metadata/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.2] - 2025-02-04 + ### Added - Derive `Copy` for `ArrayMetadataV2Order` - Add `codec_metadata_v2_to_v3` @@ -70,7 +72,8 @@ and this project adheres to [Semantic 
Versioning](https://semver.org/spec/v2.0.0 - Initial release - Split from the `metadata` module of `zarrs` 0.17.0-dev -[unreleased]: https://github.com/LDeakin/zarrs/compare/zarrs_metadata-v0.3.1...HEAD +[unreleased]: https://github.com/LDeakin/zarrs/compare/zarrs_metadata-v0.3.2...HEAD +[0.3.2]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_metadata-v0.3.2 [0.3.1]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_metadata-v0.3.1 [0.3.0]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_metadata-v0.3.0 [0.2.0]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_metadata-v0.2.0 diff --git a/zarrs_metadata/Cargo.toml b/zarrs_metadata/Cargo.toml index c1ef9f65..e49b07fb 100644 --- a/zarrs_metadata/Cargo.toml +++ b/zarrs_metadata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "zarrs_metadata" -version = "0.3.1" +version = "0.3.2" authors = ["Lachlan Deakin "] edition = "2021" rust-version = "1.77" From 9d05d146cc2785c45eba7531694730e626557b7a Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Thu, 6 Feb 2025 09:41:49 +1100 Subject: [PATCH 37/45] fix: use `bytes` codec with native endianness if unset for a Zarr V2 array --- zarrs_metadata/CHANGELOG.md | 3 +++ zarrs_metadata/src/v2_to_v3.rs | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/zarrs_metadata/CHANGELOG.md b/zarrs_metadata/CHANGELOG.md index b1ab7380..150f2005 100644 --- a/zarrs_metadata/CHANGELOG.md +++ b/zarrs_metadata/CHANGELOG.md @@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed +- Use `bytes` codec with native endianness if unset for a Zarr V2 array + ## [0.3.2] - 2025-02-04 ### Added diff --git a/zarrs_metadata/src/v2_to_v3.rs b/zarrs_metadata/src/v2_to_v3.rs index 6dc38ff2..3a203a10 100644 --- a/zarrs_metadata/src/v2_to_v3.rs +++ b/zarrs_metadata/src/v2_to_v3.rs @@ -66,6 +66,7 @@ pub enum ArrayMetadataV2ToV3ConversionError { /// /// # Errors /// Returns a [`ArrayMetadataV2ToV3ConversionError`] if 
the metadata is invalid or is not compatible with Zarr V3 metadata. +#[allow(clippy::too_many_lines)] pub fn codec_metadata_v2_to_v3( order: ArrayMetadataV2Order, dimensionality: usize, @@ -148,7 +149,9 @@ pub fn codec_metadata_v2_to_v3( if !has_array_to_bytes { let bytes_metadata = MetadataV3::new_with_serializable_configuration( crate::v3::array::codec::bytes::IDENTIFIER, - &BytesCodecConfigurationV1 { endian: endianness }, + &BytesCodecConfigurationV1 { + endian: Some(endianness.unwrap_or(Endianness::native())), + }, )?; codecs.push(bytes_metadata); } From 623b918e3efce25f09941899c1c600798cfb758a Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Thu, 6 Feb 2025 09:46:07 +1100 Subject: [PATCH 38/45] fix: permit string compression levels in `zstd` codec metadata (for `zarr-python` compatibility) --- zarrs_metadata/CHANGELOG.md | 1 + zarrs_metadata/src/v3/array/codec/zstd.rs | 26 ++++++++++++++++++----- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/zarrs_metadata/CHANGELOG.md b/zarrs_metadata/CHANGELOG.md index 150f2005..7f22d249 100644 --- a/zarrs_metadata/CHANGELOG.md +++ b/zarrs_metadata/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Fixed +- Permit string compression levels in `zstd` codec metadata (for `zarr-python` compatibility) - Use `bytes` codec with native endianness if unset for a Zarr V2 array ## [0.3.2] - 2025-02-04 diff --git a/zarrs_metadata/src/v3/array/codec/zstd.rs b/zarrs_metadata/src/v3/array/codec/zstd.rs index 2989d432..515a46e4 100644 --- a/zarrs_metadata/src/v3/array/codec/zstd.rs +++ b/zarrs_metadata/src/v3/array/codec/zstd.rs @@ -1,5 +1,6 @@ use derive_more::{Display, From}; use serde::{Deserialize, Serialize}; +use serde_json::Value; /// The identifier for the `zstd` codec. 
pub const IDENTIFIER: &str = "zstd"; @@ -40,12 +41,27 @@ pub struct ZstdCompressionLevel(i32); impl<'de> serde::Deserialize<'de> for ZstdCompressionLevel { fn deserialize>(d: D) -> Result { - let number = serde_json::Number::deserialize(d)?; - if let Some(number) = number.as_i64() { - if (-131_072..=22).contains(&number) { - #[allow(clippy::cast_possible_truncation)] - return Ok(Self(number as i32)); + let value = Value::deserialize(d)?; + match value { + Value::Number(number) => { + if let Some(number) = number.as_i64() { + if (-131_072..=22).contains(&number) { + #[allow(clippy::cast_possible_truncation)] + return Ok(Self(number as i32)); + } + } } + Value::String(string) => { + // COMPATIBILITY: support data created with zarr-python that uses a string for the level + // https://github.com/zarr-developers/zarr-python/blob/a52048ddb2d5d069c3404e7457439a9ecb5e40c3/tests/test_v2.py#L278-L280 + if let Ok(number) = string.parse::() { + if (-131_072..=22).contains(&number) { + #[allow(clippy::cast_possible_truncation)] + return Ok(Self(number as i32)); + } + } + } + _ => {} } Err(serde::de::Error::custom( "Zstd compression level must be an integer between -131072 and 22", From 3069b6f0aec1b18f8cb53a06e5041034b5438115 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Thu, 6 Feb 2025 09:46:50 +1100 Subject: [PATCH 39/45] zarrs_metadata: prepare 0.3.3 release --- zarrs_metadata/CHANGELOG.md | 5 ++++- zarrs_metadata/Cargo.toml | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/zarrs_metadata/CHANGELOG.md b/zarrs_metadata/CHANGELOG.md index 7f22d249..45712372 100644 --- a/zarrs_metadata/CHANGELOG.md +++ b/zarrs_metadata/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.3] - 2025-02-06 + ### Fixed - Permit string compression levels in `zstd` codec metadata (for `zarr-python` compatibility) - Use `bytes` codec with native endianness if unset for a Zarr V2 array @@ -76,7 
+78,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Initial release - Split from the `metadata` module of `zarrs` 0.17.0-dev -[unreleased]: https://github.com/LDeakin/zarrs/compare/zarrs_metadata-v0.3.2...HEAD +[unreleased]: https://github.com/LDeakin/zarrs/compare/zarrs_metadata-v0.3.3...HEAD +[0.3.3]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_metadata-v0.3.3 [0.3.2]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_metadata-v0.3.2 [0.3.1]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_metadata-v0.3.1 [0.3.0]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_metadata-v0.3.0 diff --git a/zarrs_metadata/Cargo.toml b/zarrs_metadata/Cargo.toml index e49b07fb..9e6effa7 100644 --- a/zarrs_metadata/Cargo.toml +++ b/zarrs_metadata/Cargo.toml @@ -1,12 +1,12 @@ [package] name = "zarrs_metadata" -version = "0.3.2" +version = "0.3.3" authors = ["Lachlan Deakin "] edition = "2021" rust-version = "1.77" description = "Zarr metadata support for the zarrs crate" homepage = "https://zarrs.dev" -documentation = "https://docs.rs/zarrs_object_store" +documentation = "https://docs.rs/zarrs_metadata" repository = "https://github.com/LDeakin/zarrs" license = "MIT OR Apache-2.0" keywords = ["zarr", "zarrs", "metadata"] From d24d316c09825400983c9ee7bedb97e5c9264bf4 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Sat, 8 Feb 2025 10:04:27 +1100 Subject: [PATCH 40/45] fix(http): crate manifest `documentation` and `keywords` --- zarrs_http/CHANGELOG.md | 3 +++ zarrs_http/Cargo.toml | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/zarrs_http/CHANGELOG.md b/zarrs_http/CHANGELOG.md index 848702e8..3a6b3984 100644 --- a/zarrs_http/CHANGELOG.md +++ b/zarrs_http/CHANGELOG.md @@ -10,6 +10,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Bump `itertools` to 0.14 +### Fixed +- Fixed crate manifest `documentation` and `keywords` + ## [0.2.0] - 2024-11-15 ### Changed 
diff --git a/zarrs_http/Cargo.toml b/zarrs_http/Cargo.toml index 68dd11c5..008381a4 100644 --- a/zarrs_http/Cargo.toml +++ b/zarrs_http/Cargo.toml @@ -6,10 +6,10 @@ edition = "2021" rust-version = "1.77" description = "A synchronous http store for the zarrs crate" homepage = "https://zarrs.dev" -documentation = "https://docs.rs/zarrs_storage" +documentation = "https://docs.rs/zarrs_http" repository = "https://github.com/LDeakin/zarrs" license = "MIT OR Apache-2.0" -keywords = ["zarr", "zarrs", "storage", "store"] +keywords = ["zarr", "zarrs", "http"] categories = ["encoding"] [lints] From e55b56f3156f101526c10c764af8243d49123acc Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Mon, 10 Feb 2025 08:42:37 +1100 Subject: [PATCH 41/45] chore: refine repo links for subcrates --- zarrs_data_type/Cargo.toml | 2 +- zarrs_filesystem/Cargo.toml | 2 +- zarrs_http/Cargo.toml | 2 +- zarrs_metadata/Cargo.toml | 2 +- zarrs_object_store/Cargo.toml | 2 +- zarrs_opendal/Cargo.toml | 2 +- zarrs_storage/Cargo.toml | 2 +- zarrs_zip/Cargo.toml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/zarrs_data_type/Cargo.toml b/zarrs_data_type/Cargo.toml index 921a13bf..cc028d01 100644 --- a/zarrs_data_type/Cargo.toml +++ b/zarrs_data_type/Cargo.toml @@ -7,7 +7,7 @@ rust-version = "1.77" description = "Zarr data types for the zarrs crate" homepage = "https://zarrs.dev" documentation = "https://docs.rs/zarrs_data_type" -repository = "https://github.com/LDeakin/zarrs" +repository = "https://github.com/LDeakin/zarrs/tree/main/zarrs_data_type" license = "MIT OR Apache-2.0" keywords = ["zarr", "zarrs"] categories = ["encoding"] diff --git a/zarrs_filesystem/Cargo.toml b/zarrs_filesystem/Cargo.toml index fabd2bba..5dd882d5 100644 --- a/zarrs_filesystem/Cargo.toml +++ b/zarrs_filesystem/Cargo.toml @@ -7,7 +7,7 @@ rust-version = "1.77" description = "A filesystem store for the zarrs crate" homepage = "https://zarrs.dev" documentation = "https://docs.rs/zarrs_filesystem" 
-repository = "https://github.com/LDeakin/zarrs" +repository = "https://github.com/LDeakin/zarrs/tree/main/zarrs_filesystem" license = "MIT OR Apache-2.0" keywords = ["zarr", "zarrs", "storage", "store", "filesystem"] categories = ["encoding"] diff --git a/zarrs_http/Cargo.toml b/zarrs_http/Cargo.toml index 008381a4..70c22c93 100644 --- a/zarrs_http/Cargo.toml +++ b/zarrs_http/Cargo.toml @@ -7,7 +7,7 @@ rust-version = "1.77" description = "A synchronous http store for the zarrs crate" homepage = "https://zarrs.dev" documentation = "https://docs.rs/zarrs_http" -repository = "https://github.com/LDeakin/zarrs" +repository = "https://github.com/LDeakin/zarrs/tree/main/zarrs_http" license = "MIT OR Apache-2.0" keywords = ["zarr", "zarrs", "http"] categories = ["encoding"] diff --git a/zarrs_metadata/Cargo.toml b/zarrs_metadata/Cargo.toml index 9e6effa7..5918888d 100644 --- a/zarrs_metadata/Cargo.toml +++ b/zarrs_metadata/Cargo.toml @@ -7,7 +7,7 @@ rust-version = "1.77" description = "Zarr metadata support for the zarrs crate" homepage = "https://zarrs.dev" documentation = "https://docs.rs/zarrs_metadata" -repository = "https://github.com/LDeakin/zarrs" +repository = "https://github.com/LDeakin/zarrs/tree/main/zarrs_metadata" license = "MIT OR Apache-2.0" keywords = ["zarr", "zarrs", "metadata"] categories = ["encoding"] diff --git a/zarrs_object_store/Cargo.toml b/zarrs_object_store/Cargo.toml index 7089eb66..977c62f7 100644 --- a/zarrs_object_store/Cargo.toml +++ b/zarrs_object_store/Cargo.toml @@ -7,7 +7,7 @@ rust-version = "1.77" description = "object_store store support for the zarrs crate" homepage = "https://zarrs.dev" documentation = "https://docs.rs/zarrs_object_store" -repository = "https://github.com/LDeakin/zarrs" +repository = "https://github.com/LDeakin/zarrs/tree/main/zarrs_object_store" license = "MIT OR Apache-2.0" keywords = ["zarr", "zarrs", "storage", "store"] categories = ["encoding"] diff --git a/zarrs_opendal/Cargo.toml b/zarrs_opendal/Cargo.toml 
index a7a39c85..cd93d7c0 100644 --- a/zarrs_opendal/Cargo.toml +++ b/zarrs_opendal/Cargo.toml @@ -7,7 +7,7 @@ rust-version = "1.77" description = "opendal store support for the zarrs crate" homepage = "https://zarrs.dev" documentation = "https://docs.rs/zarrs_opendal" -repository = "https://github.com/LDeakin/zarrs" +repository = "https://github.com/LDeakin/zarrs/tree/main/zarrs_opendal" license = "MIT OR Apache-2.0" keywords = ["zarr", "zarrs", "storage", "store"] categories = ["encoding"] diff --git a/zarrs_storage/Cargo.toml b/zarrs_storage/Cargo.toml index a14d9931..7c8d619a 100644 --- a/zarrs_storage/Cargo.toml +++ b/zarrs_storage/Cargo.toml @@ -7,7 +7,7 @@ rust-version = "1.77" description = "The storage API and default stores for the zarrs crate" homepage = "https://zarrs.dev" documentation = "https://docs.rs/zarrs_storage" -repository = "https://github.com/LDeakin/zarrs" +repository = "https://github.com/LDeakin/zarrs/tree/main/zarrs_storage" license = "MIT OR Apache-2.0" keywords = ["zarr", "zarrs", "storage", "store"] categories = ["encoding"] diff --git a/zarrs_zip/Cargo.toml b/zarrs_zip/Cargo.toml index 14fefe91..1ca195fa 100644 --- a/zarrs_zip/Cargo.toml +++ b/zarrs_zip/Cargo.toml @@ -7,7 +7,7 @@ rust-version = "1.77" description = "A storage adapter for zip files for the zarrs crate" homepage = "https://zarrs.dev" documentation = "https://docs.rs/zarrs_zip" -repository = "https://github.com/LDeakin/zarrs" +repository = "https://github.com/LDeakin/zarrs/tree/main/zarrs_zip" license = "MIT OR Apache-2.0" keywords = ["zarr", "zarrs", "storage", "store", "zip"] categories = ["encoding"] From 8fbfff945c6c70fe68cb83fd0b8c5e4f07a3a34b Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Thu, 13 Feb 2025 10:38:21 +1100 Subject: [PATCH 42/45] fix(metadata): Make `AdditionalField` public and permit any JSON type (#144) * fix(metadata): Make `AdditionalField` public and permit any JSON type * revise UnsupportedAdditionalFieldError future change notes * add 
`UnsupportedAdditionalFieldError::new` --- zarrs/src/array/array_builder.rs | 3 +- zarrs/src/group.rs | 16 ++- zarrs/src/group/group_builder.rs | 3 +- zarrs_metadata/CHANGELOG.md | 6 ++ zarrs_metadata/src/lib.rs | 58 +++++++---- zarrs_metadata/src/v3.rs | 4 +- zarrs_metadata/src/v3/metadata.rs | 124 +++++++++++++++++------ zarrs_metadata/tests/extensions_zep_9.rs | 56 ++++++++++ 8 files changed, 208 insertions(+), 62 deletions(-) create mode 100644 zarrs_metadata/tests/extensions_zep_9.rs diff --git a/zarrs/src/array/array_builder.rs b/zarrs/src/array/array_builder.rs index 7ea6d8ba..656f1e87 100644 --- a/zarrs/src/array/array_builder.rs +++ b/zarrs/src/array/array_builder.rs @@ -232,8 +232,7 @@ impl ArrayBuilder { /// Set additional fields not defined in the Zarr specification. /// Use this cautiously. In general, store user defined attributes using [`ArrayBuilder::attributes`]. /// - /// Note that array metadata must not contain any additional fields, unless they are annotated with `"must_understand": false`. - /// `zarrs` will error when opening an array with additional fields without this annotation. + /// `zarrs` and other implementations are expected to error when opening an array with unsupported additional fields, unless they are a JSON object containing `"must_understand": false`. 
pub fn additional_fields(&mut self, additional_fields: AdditionalFields) -> &mut Self { self.additional_fields = additional_fields; self diff --git a/zarrs/src/group.rs b/zarrs/src/group.rs index 54d04380..f3e8faf0 100644 --- a/zarrs/src/group.rs +++ b/zarrs/src/group.rs @@ -754,8 +754,8 @@ mod tests { } #[test] - fn group_metadata_invalid_additional_field() { - let group_metadata = serde_json::from_str::( + fn group_metadata_unknown_additional_field() { + let group_metadata = serde_json::from_str::( r#"{ "zarr_format": 3, "node_type": "group", @@ -763,10 +763,16 @@ mod tests { "spam": "ham", "eggs": 42 }, - "unknown": "fail" + "unknown": "unsupported" }"#, - ); - assert!(group_metadata.is_err()); + ) + .unwrap(); + assert!(group_metadata.additional_fields.len() == 1); + assert!(group_metadata + .additional_fields + .get("unknown") + .unwrap() + .must_understand()); } #[test] diff --git a/zarrs/src/group/group_builder.rs b/zarrs/src/group/group_builder.rs index 36d13abd..8b0206c8 100644 --- a/zarrs/src/group/group_builder.rs +++ b/zarrs/src/group/group_builder.rs @@ -44,8 +44,7 @@ impl GroupBuilder { /// Set additional fields not defined in the Zarr specification. /// Use this cautiously. In general, store user defined attributes using [`GroupBuilder::attributes`]. /// - /// Note that array metadata must not contain any additional fields, unless they are annotated with `"must_understand": false`. - /// `zarrs` will error when opening an array with additional fields without this annotation. + /// `zarrs` and other implementations are expected to error when opening a group with unsupported additional fields, unless they are a JSON object containing `"must_understand": false`. 
pub fn additional_fields(&mut self, additional_fields: AdditionalFields) -> &mut Self { match &mut self.metadata { GroupMetadata::V3(metadata) => metadata.additional_fields = additional_fields, diff --git a/zarrs_metadata/CHANGELOG.md b/zarrs_metadata/CHANGELOG.md index 45712372..8f52bc25 100644 --- a/zarrs_metadata/CHANGELOG.md +++ b/zarrs_metadata/CHANGELOG.md @@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added +- Add `UnsupportedAdditionalFieldError::new` + +### Fixed +- Make `AdditionalField` public and permit any JSON type (not just objects) + ## [0.3.3] - 2025-02-06 ### Fixed diff --git a/zarrs_metadata/src/lib.rs b/zarrs_metadata/src/lib.rs index fadc3b2e..16577efd 100644 --- a/zarrs_metadata/src/lib.rs +++ b/zarrs_metadata/src/lib.rs @@ -75,7 +75,7 @@ pub enum NodeMetadata { #[cfg(test)] mod tests { use super::*; - use v3::{AdditionalFields, MetadataV3}; + use v3::{AdditionalField, AdditionalFields, MetadataV3}; #[test] fn metadata() { @@ -111,14 +111,27 @@ mod tests { } #[test] - fn additional_fields_auto() { - let mut additional_fields = AdditionalFields::new(); + fn additional_fields_constructors() { let additional_field = serde_json::Map::new(); - additional_fields.insert("key".to_string(), additional_field.into()); - assert!(!additional_fields.contains_key("must_understand")); - assert!(serde_json::to_string(&additional_fields) - .unwrap() - .contains(r#""must_understand":false"#)); + let additional_field: AdditionalField = additional_field.into(); + assert!(additional_field.must_understand()); + assert!( + additional_field.as_value() == &serde_json::Value::Object(serde_json::Map::default()) + ); + assert!(serde_json::to_string(&additional_field).unwrap() == r#"{"must_understand":true}"#); + + let additional_field: AdditionalField = AdditionalField::new("test", true); + assert!(additional_field.must_understand()); + assert!(additional_field.as_value() == 
&serde_json::Value::String("test".to_string())); + assert!(serde_json::to_string(&additional_field).unwrap() == r#""test""#); + + let additional_field: AdditionalField = AdditionalField::new(123, false); + assert!(!additional_field.must_understand()); + assert!( + additional_field.as_value() + == &serde_json::Value::Number(serde_json::Number::from(123)) + ); + assert!(serde_json::to_string(&additional_field).unwrap() == "123"); } #[test] @@ -127,20 +140,23 @@ mod tests { "unknown_field": { "key": "value", "must_understand": false - } - }"#; - let additional_fields = serde_json::from_str::(json); - assert!(additional_fields.is_ok()); - } - - #[test] - fn additional_fields_invalid() { - let json = r#"{ - "unknown_field": { + }, + "unsupported_field_1": { + "key": "value", + "must_understand": true + }, + "unsupported_field_2": { "key": "value" - } + }, + "unsupported_field_3": [], + "unsupported_field_4": "test" }"#; - let additional_fields = serde_json::from_str::(json); - assert!(additional_fields.is_err()); + let additional_fields = serde_json::from_str::(json).unwrap(); + assert!(additional_fields.len() == 5); + assert!(!additional_fields["unknown_field"].must_understand()); + assert!(additional_fields["unsupported_field_1"].must_understand()); + assert!(additional_fields["unsupported_field_2"].must_understand()); + assert!(additional_fields["unsupported_field_3"].must_understand()); + assert!(additional_fields["unsupported_field_4"].must_understand()); } } diff --git a/zarrs_metadata/src/v3.rs b/zarrs_metadata/src/v3.rs index d67b0b01..960e3c58 100644 --- a/zarrs_metadata/src/v3.rs +++ b/zarrs_metadata/src/v3.rs @@ -9,8 +9,8 @@ pub use group::GroupMetadataV3; mod metadata; pub use metadata::{ - AdditionalFields, ConfigurationInvalidError, MetadataConfiguration, MetadataV3, - UnsupportedAdditionalFieldError, + AdditionalField, AdditionalFields, ConfigurationInvalidError, MetadataConfiguration, + MetadataV3, UnsupportedAdditionalFieldError, }; /// V3 node metadata 
([`ArrayMetadataV3`] or [`GroupMetadataV3`]). diff --git a/zarrs_metadata/src/v3/metadata.rs b/zarrs_metadata/src/v3/metadata.rs index edcafa6d..bcc4279f 100644 --- a/zarrs_metadata/src/v3/metadata.rs +++ b/zarrs_metadata/src/v3/metadata.rs @@ -1,5 +1,6 @@ use derive_more::From; use serde::{de::DeserializeOwned, ser::SerializeMap, Deserialize, Serialize}; +use serde_json::Value; use thiserror::Error; /// Metadata with a name and optional configuration. @@ -33,7 +34,7 @@ pub struct MetadataV3 { } /// Configuration metadata. -pub type MetadataConfiguration = serde_json::Map; +pub type MetadataConfiguration = serde_json::Map; impl TryFrom<&str> for MetadataV3 { type Error = serde_json::Error; @@ -138,7 +139,7 @@ impl MetadataV3 { configuration: &TConfiguration, ) -> Result { let configuration = serde_json::to_value(configuration)?; - if let serde_json::Value::Object(configuration) = configuration { + if let Value::Object(configuration) = configuration { Ok(Self::new_with_configuration(name, configuration)) } else { Err(serde::ser::Error::custom( @@ -212,6 +213,7 @@ impl ConfigurationInvalidError { } } +// FIXME: Move to `zarrs` itself in 0.4.0 /// An unsupported additional field error. /// /// An unsupported field in array or group metadata is an unrecognised field without `"must_understand": false`. @@ -219,10 +221,16 @@ impl ConfigurationInvalidError { #[error("unsupported additional field {name} with value {value}")] pub struct UnsupportedAdditionalFieldError { name: String, - value: serde_json::Value, + value: Value, } impl UnsupportedAdditionalFieldError { + /// Create a new [`UnsupportedAdditionalFieldError`]. + #[must_use] + pub fn new(name: String, value: Value) -> UnsupportedAdditionalFieldError { + Self { name, value } + } + /// Return the name of the unsupported additional field. #[must_use] pub fn name(&self) -> &str { @@ -231,54 +239,110 @@ impl UnsupportedAdditionalFieldError { /// Return the value of the unsupported additional field. 
#[must_use] - pub const fn value(&self) -> &serde_json::Value { + pub const fn value(&self) -> &Value { &self.value } } /// An additional field in array or group metadata. /// -/// Must be an object with a `"must_understand": false` field. -#[derive(Serialize, Deserialize, Clone, Eq, PartialEq, Debug, Default, From)] +/// A field that is not recognised / supported by `zarrs` will be considered an additional field. +/// Additional fields can be any JSON type. +/// An array / group cannot be created with an additional field, unless the additional field is an object with a `"must_understand": false` field. +/// +/// ### Example additional field JSON +/// ```json +// "unknown_field": { +// "key": "value", +// "must_understand": false +// }, +// "unsupported_field_1": { +// "key": "value", +// "must_understand": true +// }, +// "unsupported_field_2": { +// "key": "value" +// }, +// "unsupported_field_3": [], +// "unsupported_field_4": "test" +/// ``` +#[derive(Clone, Eq, PartialEq, Debug, Default)] pub struct AdditionalField { - must_understand: monostate::MustBe!(false), - #[serde(flatten)] - fields: serde_json::Map, + field: Value, + must_understand: bool, } impl AdditionalField { - /// Return the underlying map. + /// Create a new additional field. + #[must_use] + pub fn new(field: impl Into, must_understand: bool) -> AdditionalField { + Self { + field: field.into(), + must_understand, + } + } + + /// Return the underlying value. + #[must_use] + pub const fn as_value(&self) -> &Value { + &self.field + } + + /// Return the `must_understand` component of the additional field. 
#[must_use] - pub const fn as_map(&self) -> &serde_json::Map { - &self.fields + pub const fn must_understand(&self) -> bool { + self.must_understand } } -impl From for serde_json::Map { - fn from(value: AdditionalField) -> Self { - value.fields +impl Serialize for AdditionalField { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + match &self.field { + Value::Object(object) => { + let mut map = serializer.serialize_map(Some(object.len() + 1))?; + map.serialize_entry("must_understand", &Value::Bool(self.must_understand))?; + for (k, v) in object { + map.serialize_entry(k, v)?; + } + map.end() + } + _ => self.field.serialize(serializer), + } + } +} + +impl<'de> serde::Deserialize<'de> for AdditionalField { + fn deserialize>(d: D) -> Result { + let value = Value::deserialize(d)?; + Ok(value.into()) } } -impl From> for AdditionalField { - fn from(value: serde_json::Map) -> Self { +impl From for AdditionalField +where + T: Into, +{ + fn from(field: T) -> Self { + let mut value: Value = field.into(); + let must_understand = if let Some(object) = value.as_object_mut() { + if let Some(Value::Bool(must_understand)) = object.remove("must_understand") { + must_understand + } else { + true + } + } else { + true + }; Self { - must_understand: monostate::MustBe!(false), - fields: value, + must_understand, + field: value, } } } /// Additional fields in array or group metadata. -/// -/// Additional fields are a JSON object with a `"must_understand": false` key-value pair. -/// -/// ### Example additional field JSON -/// ```json -/// "unknown_field": { -/// "key": "value", -/// "must_understand": false -/// } -/// ``` -// NOTE: It would be nice if this was just a serde_json::Map, but it only has implementations for ``. +// NOTE: It would be nice if this was just a serde_json::Map, but it only has implementations for ``. 
pub type AdditionalFields = std::collections::BTreeMap; diff --git a/zarrs_metadata/tests/extensions_zep_9.rs b/zarrs_metadata/tests/extensions_zep_9.rs new file mode 100644 index 00000000..4c623844 --- /dev/null +++ b/zarrs_metadata/tests/extensions_zep_9.rs @@ -0,0 +1,56 @@ +#![allow(missing_docs)] + +use zarrs_metadata::v3::ArrayMetadataV3; + +#[test] +fn array_extensions() { + let json = r#"{ + "zarr_format": 3, + "node_type": "array", + "data_type": "https://example.com/zarr/string", + "fill_value": "", + "chunk_key_encoding": { + "name": "default", + "configuration": { "separator": "." } + }, + "codecs": [ + { + "name": "https://numcodecs.dev/vlen-utf8" + }, + { + "name": "zstd", + "configuration": {} + } + ], + "chunk_grid": { + "name": "regular", + "configuration": { "chunk_shape": [ 32 ] } + }, + "shape": [ 128 ], + "dimension_names": [ "x" ], + "attributes": {}, + "storage_transformers": [], + "extensions": [ + { + "name": "https://example.com/zarr/offset", + "configuration": { "offset": [ 12 ] } + }, + { + "name": "https://example.com/zarr/array-statistics", + "configuration": { + "min": 5, + "max": 12 + }, + "must_understand": false + }, + { + "name": "https://example.com/zarr/consolidated-metadata", + "configuration": {}, + "must_understand": false + } + ] +}"#; + + let metadata: ArrayMetadataV3 = serde_json::from_str(&json).unwrap(); + assert_eq!(metadata.data_type.name(), "https://example.com/zarr/string"); +} From e3bc74308e7546616f870c4ecceb94ac1680c14e Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Thu, 13 Feb 2025 10:39:30 +1100 Subject: [PATCH 43/45] zarrs_metadata: prepare 0.3.4 release --- Cargo.toml | 2 +- zarrs_metadata/CHANGELOG.md | 5 ++++- zarrs_metadata/Cargo.toml | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 00653ae8..dfbcfaf6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ version = "0.1.0" path = "zarrs_data_type" [workspace.dependencies.zarrs_metadata] -version = 
"0.3.0" +version = "0.3.4" path = "zarrs_metadata" [workspace.dependencies.zarrs_storage] diff --git a/zarrs_metadata/CHANGELOG.md b/zarrs_metadata/CHANGELOG.md index 8f52bc25..26c9a3c6 100644 --- a/zarrs_metadata/CHANGELOG.md +++ b/zarrs_metadata/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.4] - 2025-02-13 + ### Added - Add `UnsupportedAdditionalFieldError::new` @@ -84,7 +86,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Initial release - Split from the `metadata` module of `zarrs` 0.17.0-dev -[unreleased]: https://github.com/LDeakin/zarrs/compare/zarrs_metadata-v0.3.3...HEAD +[unreleased]: https://github.com/LDeakin/zarrs/compare/zarrs_metadata-v0.3.4...HEAD +[0.3.4]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_metadata-v0.3.4 [0.3.3]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_metadata-v0.3.3 [0.3.2]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_metadata-v0.3.2 [0.3.1]: https://github.com/LDeakin/zarrs/releases/tag/zarrs_metadata-v0.3.1 diff --git a/zarrs_metadata/Cargo.toml b/zarrs_metadata/Cargo.toml index 5918888d..2efac0da 100644 --- a/zarrs_metadata/Cargo.toml +++ b/zarrs_metadata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "zarrs_metadata" -version = "0.3.3" +version = "0.3.4" authors = ["Lachlan Deakin "] edition = "2021" rust-version = "1.77" From 3ccff0f8c9ba3298750b27f702112a477d9dffab Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Thu, 13 Feb 2025 10:53:23 +1100 Subject: [PATCH 44/45] chore(CI): run on version branches --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 57a3c7d6..cee9dacb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,9 +2,9 @@ name: build on: push: - branches: ["main", "v[0-9]+.[0-9]+"] + branches: ["main", "v[0-9]+.[0-9]+.x"] pull_request: - 
branches: ["main", "v[0-9]+.[0-9]+"] + branches: ["main", "v[0-9]+.[0-9]+.x"] env: CARGO_TERM_COLOR: always From 4775e844da31bc7e789545293b14532c1d6639bc Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Thu, 13 Feb 2025 11:02:53 +1100 Subject: [PATCH 45/45] chore: add 0.19.2 to changelog --- CHANGELOG.md | 9 ++++++++- CITATION.cff | 4 ++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3875821f..111fe991 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **Breaking**: move the `zarrs::array::{data_type,fill_value}` modules into the `zarrs_data_type` crate - Bump `lru` to 0.13 +## [0.19.2] - 2025-02-13 + +### Changed +- Bump `zarrs_metadata` to 0.3.4 which includes a number of Zarr metadata fixes + - See the [`zarrs_metadata` CHANGELOG.md](https://github.com/LDeakin/zarrs/blob/main/zarrs_metadata/CHANGELOG.md) + ## [0.19.1] - 2025-01-19 ### Added @@ -1252,7 +1258,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Initial public release -[unreleased]: https://github.com/LDeakin/zarrs/compare/zarrs-v0.19.1...HEAD +[unreleased]: https://github.com/LDeakin/zarrs/compare/zarrs-v0.19.2...HEAD +[0.19.2]: https://github.com/LDeakin/zarrs/releases/tag/zarrs-v0.19.2 [0.19.1]: https://github.com/LDeakin/zarrs/releases/tag/zarrs-v0.19.1 [0.19.0]: https://github.com/LDeakin/zarrs/releases/tag/zarrs-v0.19.0 [0.18.3]: https://github.com/LDeakin/zarrs/releases/tag/zarrs-v0.18.3 diff --git a/CITATION.cff b/CITATION.cff index 36903046..86bd8679 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,8 +1,8 @@ cff-version: 1.2.0 message: "If you use this software, please cite it as below." 
title: "zarrs" -version: 0.19.1 -date-released: 2025-01-19 +version: 0.19.2 +date-released: 2025-02-13 repository-code: "https://github.com/LDeakin/zarrs" url: "https://zarrs.dev" abstract: "zarrs is a Rust library for the Zarr storage format for multidimensional arrays and metadata."