diff --git a/CHANGELOG.md b/CHANGELOG.md index 606b7eef..acc6b1a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,7 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **Breaking**: change `ArraySubset::inbounds` to take another subset rather than a shape - **Breaking**: `CodecError` enum changes: - Change `CodecError::UnexpectedChunkDecodedSize` to an `InvalidBytesLengthError` - - Add `CodecError::{InvalidArrayShape,InvalidNumberOfElements,SubsetOutOfBounds}` + - Add `CodecError::{InvalidArrayShape,InvalidNumberOfElements,SubsetOutOfBounds,RawBytesOffsetsCreate,RawBytesOffsetsOutOfBounds}` - **Breaking**: Change output args to `ArrayBytesFixedDisjointView` and make safe the following: - `Array::[async_]retrieve_chunk[_subset]_into` - `[Async]ArrayPartialDecoderTraits::partial_decode_into` @@ -24,6 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `zarrs::array::copy_fill_value_into` - `zarrs::array::update_array_bytes` - **Breaking**: change `RawBytesOffsets` into a validated newtype +- **Breaking**: `ArrayBytes::new_vlen()` not returns a `Result` and validates bytes/offsets compatibility ## [0.19.1] - 2025-01-19 diff --git a/zarrs/src/array.rs b/zarrs/src/array.rs index 1f7b17f6..0d5cd65a 100644 --- a/zarrs/src/array.rs +++ b/zarrs/src/array.rs @@ -49,7 +49,7 @@ pub use self::{ array_builder::ArrayBuilder, array_bytes::{ copy_fill_value_into, update_array_bytes, ArrayBytes, ArrayBytesError, RawBytes, - RawBytesOffsets, RawBytesOffsetsCreateError, + RawBytesOffsets, RawBytesOffsetsCreateError, RawBytesOffsetsOutOfBoundsError, }, array_bytes_fixed_disjoint_view::{ ArrayBytesFixedDisjointView, ArrayBytesFixedDisjointViewCreateError, diff --git a/zarrs/src/array/array_bytes.rs b/zarrs/src/array/array_bytes.rs index 498d66b2..0bfa9a10 100644 --- a/zarrs/src/array/array_bytes.rs +++ b/zarrs/src/array/array_bytes.rs @@ -1,5 +1,6 @@ use std::borrow::Cow; +use derive_more::derive::Display; use itertools::Itertools; use thiserror::Error; use unsafe_cell_slice::UnsafeCellSlice; @@ -38,9 +39,18 @@ pub enum ArrayBytes<'a> { /// The bytes and offsets are modeled on the [Apache Arrow Variable-size Binary Layout](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout). /// - The offsets buffer contains length + 1 ~~signed integers (either 32-bit or 64-bit, depending on the data type)~~ usize integers. /// - Offsets must be monotonically increasing, that is `offsets[j+1] >= offsets[j]` for `0 <= j < length`, even for null slots. Thus, the bytes represent C-contiguous elements with padding permitted. + /// - The final offset must be less than or equal to the length of the bytes buffer. Variable(RawBytes<'a>, RawBytesOffsets<'a>), } +/// An error raised if variable length array bytes offsets are out of bounds. +#[derive(Debug, Error, Display)] +#[display("Offset {offset} is out of bounds for bytes of length {len}")] +pub struct RawBytesOffsetsOutOfBoundsError { + offset: usize, + len: usize, +} + /// Errors related to [`ArrayBytes<'_>`] and [`ArrayBytes`]. #[derive(Debug, Error)] pub enum ArrayBytesError { @@ -58,8 +68,40 @@ impl<'a> ArrayBytes<'a> { } /// Create a new variable length array bytes from `bytes` and `offsets`. - pub fn new_vlen(bytes: impl Into>, offsets: RawBytesOffsets<'a>) -> Self { - Self::Variable(bytes.into(), offsets) + /// + /// # Errors + /// Returns a [`RawBytesOffsetsOutOfBoundsError`] if the last offset is out of bounds of the bytes. + pub fn new_vlen( + bytes: impl Into>, + offsets: RawBytesOffsets<'a>, + ) -> Result { + let bytes = bytes.into(); + match offsets.last() { + Some(&last) => { + if last <= bytes.len() { + Ok(Self::Variable(bytes, offsets)) + } else { + Err(RawBytesOffsetsOutOfBoundsError { + offset: last, + len: bytes.len(), + }) + } + } + None => Err(RawBytesOffsetsOutOfBoundsError { offset: 0, len: 0 }), + } + } + + /// Create a new variable length array bytes from `bytes` and `offsets` without checking the offsets. + /// + /// # Safety + /// The last offset must be less than or equal to the length of the bytes. + pub unsafe fn new_vlen_unchecked( + bytes: impl Into>, + offsets: RawBytesOffsets<'a>, + ) -> Self { + let bytes = bytes.into(); + debug_assert!(offsets.last().is_some_and(|&last| last <= bytes.len())); + Self::Variable(bytes, offsets) } /// Create a new [`ArrayBytes`] with `num_elements` composed entirely of the `fill_value`. @@ -78,14 +120,18 @@ impl<'a> ArrayBytes<'a> { } ArraySize::Variable { num_elements } => { let num_elements = usize::try_from(num_elements).unwrap(); - Self::new_vlen(fill_value.as_ne_bytes().repeat(num_elements), unsafe { + let offsets = unsafe { // SAFETY: The offsets are monotonically increasing. RawBytesOffsets::new_unchecked( (0..=num_elements) .map(|i| i * fill_value.size()) .collect::>(), ) - }) + }; + unsafe { + // SAFETY: The last offset is equal to the length of the bytes + Self::new_vlen_unchecked(fill_value.as_ne_bytes().repeat(num_elements), offsets) + } } } } @@ -135,9 +181,9 @@ impl<'a> ArrayBytes<'a> { #[must_use] pub fn into_owned<'b>(self) -> ArrayBytes<'b> { match self { - Self::Fixed(bytes) => ArrayBytes::<'b>::new_flen(bytes.into_owned()), + Self::Fixed(bytes) => ArrayBytes::<'b>::Fixed(bytes.into_owned().into()), Self::Variable(bytes, offsets) => { - ArrayBytes::<'b>::new_vlen(bytes.into_owned(), offsets.into_owned()) + ArrayBytes::<'b>::Variable(bytes.into_owned().into(), offsets.into_owned()) } } } @@ -206,7 +252,11 @@ impl<'a> ArrayBytes<'a> { // SAFETY: The offsets are monotonically increasing. RawBytesOffsets::new_unchecked(ss_offsets) }; - Ok(ArrayBytes::new_vlen(ss_bytes, ss_offsets)) + let array_bytes = unsafe { + // SAFETY: The last offset is equal to the length of the bytes + ArrayBytes::new_vlen_unchecked(ss_bytes, ss_offsets) + }; + Ok(array_bytes) } ArrayBytes::Fixed(bytes) => { let byte_ranges = @@ -337,8 +387,11 @@ pub(crate) fn update_bytes_vlen<'a>( // SAFETY: The offsets are monotonically increasing. RawBytesOffsets::new_unchecked(offsets_new) }; - - Ok(ArrayBytes::new_vlen(bytes_new, offsets_new)) + let array_bytes = unsafe { + // SAFETY: The last offset is equal to the length of the bytes + ArrayBytes::new_vlen_unchecked(bytes_new, offsets_new) + }; + Ok(array_bytes) } /// Update a subset of an array. @@ -462,7 +515,12 @@ pub(crate) fn merge_chunks_vlen<'a>( } } - Ok(ArrayBytes::new_vlen(bytes, offsets)) + let array_bytes = unsafe { + // SAFETY: The last offset is equal to the length of the bytes + ArrayBytes::new_vlen_unchecked(bytes, offsets) + }; + + Ok(array_bytes) } pub(crate) fn extract_decoded_regions_vlen<'a>( @@ -496,7 +554,11 @@ pub(crate) fn extract_decoded_regions_vlen<'a>( // SAFETY: The offsets are monotonically increasing. RawBytesOffsets::new_unchecked(region_offsets) }; - out.push(ArrayBytes::new_vlen(region_bytes, region_offsets)); + let array_bytes = unsafe { + // SAFETY: The last offset is equal to the length of the bytes + ArrayBytes::new_vlen_unchecked(region_bytes, region_offsets) + }; + out.push(array_bytes); } Ok(out) } diff --git a/zarrs/src/array/codec.rs b/zarrs/src/array/codec.rs index a520f2c5..f5adeed0 100644 --- a/zarrs/src/array/codec.rs +++ b/zarrs/src/array/codec.rs @@ -97,6 +97,7 @@ use std::any::Any; use std::borrow::Cow; use std::sync::Arc; +use super::RawBytesOffsetsOutOfBoundsError; use super::{ array_bytes::RawBytesOffsetsCreateError, concurrency::RecommendedConcurrency, ArrayBytes, ArrayBytesFixedDisjointView, BytesRepresentation, ChunkRepresentation, ChunkShape, DataType, @@ -1063,6 +1064,9 @@ pub enum CodecError { /// Invalid byte offsets for variable length data. #[error(transparent)] RawBytesOffsetsCreate(#[from] RawBytesOffsetsCreateError), + /// Variable length array bytes offsets are out of bounds. + #[error(transparent)] + RawBytesOffsetsOutOfBounds(#[from] RawBytesOffsetsOutOfBoundsError), } impl From<&str> for CodecError { diff --git a/zarrs/src/array/codec/array_to_array/transpose.rs b/zarrs/src/array/codec/array_to_array/transpose.rs index 0276fd76..8e0f1ec5 100644 --- a/zarrs/src/array/codec/array_to_array/transpose.rs +++ b/zarrs/src/array/codec/array_to_array/transpose.rs @@ -124,8 +124,11 @@ fn transpose_vlen<'a>( // SAFETY: The offsets are monotonically increasing. RawBytesOffsets::new_unchecked(offsets_new) }; - - ArrayBytes::new_vlen(bytes_new, offsets_new) + let array_bytes = unsafe { + // SAFETY: The last offset is equal to the length of the bytes + ArrayBytes::new_vlen_unchecked(bytes_new, offsets_new) + }; + array_bytes } #[cfg(test)] diff --git a/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs b/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs index 28956d89..9e0a23a3 100644 --- a/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs +++ b/zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs @@ -265,7 +265,7 @@ impl ArrayToBytesCodecTraits for VlenCodec { } } .unwrap(); - let (data, offsets) = super::get_vlen_bytes_and_offsets( + let (bytes, offsets) = super::get_vlen_bytes_and_offsets( &index_chunk_rep, &bytes, &self.index_codecs, @@ -273,8 +273,8 @@ impl ArrayToBytesCodecTraits for VlenCodec { options, )?; let offsets = RawBytesOffsets::new(offsets)?; - - Ok(ArrayBytes::new_vlen(data, offsets)) + let array_bytes = ArrayBytes::new_vlen(bytes, offsets)?; + Ok(array_bytes) } fn partial_decoder( diff --git a/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_codec.rs b/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_codec.rs index 0970a8fa..e0bcbef3 100644 --- a/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_codec.rs +++ b/zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_codec.rs @@ -112,7 +112,8 @@ impl ArrayToBytesCodecTraits for VlenV2Codec { let num_elements = decoded_representation.num_elements_usize(); let (bytes, offsets) = super::get_interleaved_bytes_and_offsets(num_elements, &bytes)?; let offsets = RawBytesOffsets::new(offsets)?; - Ok(ArrayBytes::new_vlen(bytes, offsets)) + let array_bytes = ArrayBytes::new_vlen(bytes, offsets)?; + Ok(array_bytes) } fn partial_decoder( diff --git a/zarrs/src/array/element.rs b/zarrs/src/array/element.rs index b2949898..aa826358 100644 --- a/zarrs/src/array/element.rs +++ b/zarrs/src/array/element.rs @@ -196,7 +196,11 @@ macro_rules! impl_element_string { for element in elements { bytes.extend_from_slice(element.as_bytes()); } - Ok(ArrayBytes::new_vlen(bytes, offsets)) + let array_bytes = unsafe { + // SAFETY: The last offset is the length of the bytes. + ArrayBytes::new_vlen_unchecked(bytes, offsets) + }; + Ok(array_bytes) } } }; @@ -252,7 +256,11 @@ macro_rules! impl_element_binary { // Concatenate bytes let bytes = elements.concat(); - Ok(ArrayBytes::new_vlen(bytes, offsets)) + let array_bytes = unsafe { + // SAFETY: The last offset is the length of the bytes. + ArrayBytes::new_vlen_unchecked(bytes, offsets) + }; + Ok(array_bytes) } } };