diff --git a/Cargo.toml b/Cargo.toml
index c7b26ad..a832f9e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,7 +10,9 @@ crate-type = ["cdylib", "rlib"]
 
 [dependencies]
 pyo3 = { version = "0.23.2", features = ["abi3-py311"] }
-zarrs = { version = "0.19.0", features = ["async"] }
+zarrs = { git = "https://github.com/LDeakin/zarrs.git", rev = "1354a0fb830144146a7bf6b5cff31f1f429a6f63", features = [
+    "async",
+] }
 rayon_iter_concurrent_limit = "0.2.0"
 rayon = "1.10.0"
 # fix for https://stackoverflow.com/questions/76593417/package-openssl-was-not-found-in-the-pkg-config-search-path
@@ -21,8 +23,7 @@ serde_json = "1.0.128"
 pyo3-stub-gen = "0.6.2"
 opendal = { version = "0.51.0", features = ["services-http"] }
 tokio = { version = "1.41.1", features = ["rt-multi-thread"] }
-zarrs_opendal = "0.5.0"
-zarrs_metadata = "0.3.3" # require recent zarr-python compatibility fixes (remove with zarrs 0.20)
+zarrs_opendal = { git = "https://github.com/LDeakin/zarrs.git", rev = "1354a0fb830144146a7bf6b5cff31f1f429a6f63" }
 
 [profile.release]
 lto = true
diff --git a/pyproject.toml b/pyproject.toml
index 91916c6..0e5cf08 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -99,7 +99,7 @@ select = [
     "W",      # Warning detected by Pycodestyle
     "UP",     # pyupgrade
     "I",      # isort
-    "TCH",    # manage type checking blocks
+    "TC",     # manage type checking blocks
     "TID251", # Banned imports
     "ICN",    # Follow import conventions
     "PTH",    # Pathlib instead of os.path
diff --git a/src/lib.rs b/src/lib.rs
index cae087c..c01759a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -16,7 +16,8 @@ use rayon_iter_concurrent_limit::iter_concurrent_limit;
 use unsafe_cell_slice::UnsafeCellSlice;
 use zarrs::array::codec::{ArrayToBytesCodecTraits, CodecOptions, CodecOptionsBuilder};
 use zarrs::array::{
-    copy_fill_value_into, update_array_bytes, ArrayBytes, ArraySize, CodecChain, FillValue,
+    copy_fill_value_into, update_array_bytes, ArrayBytes, ArrayBytesFixedDisjointView, ArraySize,
+    CodecChain, FillValue,
 };
 use zarrs::array_subset::ArraySubset;
 use zarrs::metadata::v3::MetadataV3;
@@ -107,7 +108,7 @@ impl CodecPipelineImpl {
         codec_options: &CodecOptions,
     ) -> PyResult<()> {
         let array_shape = item.representation().shape_u64();
-        if !chunk_subset.inbounds(&array_shape) {
+        if !chunk_subset.inbounds_shape(&array_shape) {
             return Err(PyErr::new::(format!(
                 "chunk subset ({chunk_subset}) is out of bounds for array shape ({array_shape:?})"
             )));
@@ -127,20 +128,14 @@ impl CodecPipelineImpl {
         let chunk_bytes_old = self.retrieve_chunk_bytes(item, codec_chain, codec_options)?;
 
         // Update the chunk
-        let chunk_bytes_new = unsafe {
-            // SAFETY:
-            // - chunk_bytes_old is compatible with the chunk shape and data type size (validated on decoding)
-            // - chunk_subset is compatible with chunk_subset_bytes and the data type size (validated above)
-            // - chunk_subset is within the bounds of the chunk shape (validated above)
-            // - output bytes and output subset bytes are compatible (same data type)
-            update_array_bytes(
-                chunk_bytes_old,
-                &array_shape,
-                chunk_subset,
-                &chunk_subset_bytes,
-                data_type_size,
-            )
-        };
+        let chunk_bytes_new = update_array_bytes(
+            chunk_bytes_old,
+            &array_shape,
+            chunk_subset,
+            &chunk_subset_bytes,
+            data_type_size,
+        )
+        .map_py_err::()?;
 
         // Store the updated chunk
         self.store_chunk_bytes(item, codec_chain, chunk_bytes_new, codec_options)
@@ -270,42 +265,50 @@ impl CodecPipelineImpl {
         // For variable length data types, need a codepath with non `_into` methods.
         // Collect all the subsets and copy into value on the Python side?
         let update_chunk_subset = |item: chunk_item::WithSubset| {
+            let chunk_item::WithSubset {
+                item,
+                subset,
+                chunk_subset,
+            } = item;
+            let mut output_view = unsafe {
+                // TODO: Is the following correct?
+                // can we guarantee that when this function is called from Python with arbitrary arguments?
+                // SAFETY: chunks represent disjoint array subsets
+                ArrayBytesFixedDisjointView::new(
+                    output,
+                    // TODO: why is data_type in `item`, it should be derived from `output`, no?
+                    item.representation()
+                        .data_type()
+                        .fixed_size()
+                        .ok_or("variable length data type not supported")
+                        .map_py_err::()?,
+                    &output_shape,
+                    subset,
+                )
+                .map_py_err::()?
+            };
+
             // See zarrs::array::Array::retrieve_chunk_subset_into
-            if item.chunk_subset.start().iter().all(|&o| o == 0)
-                && item.chunk_subset.shape() == item.representation().shape_u64()
+            if chunk_subset.start().iter().all(|&o| o == 0)
+                && chunk_subset.shape() == item.representation().shape_u64()
             {
                 // See zarrs::array::Array::retrieve_chunk_into
                 if let Some(chunk_encoded) = self.stores.get(&item)? {
                     // Decode the encoded data into the output buffer
                     let chunk_encoded: Vec<u8> = chunk_encoded.into();
-                    unsafe {
-                        // SAFETY:
-                        // - output is an array with output_shape elements of the item.representation data type,
-                        // - item.subset is within the bounds of output_shape.
-                        self.codec_chain.decode_into(
-                            Cow::Owned(chunk_encoded),
-                            item.representation(),
-                            &output,
-                            &output_shape,
-                            &item.subset,
-                            &codec_options,
-                        )
-                    }
+                    self.codec_chain.decode_into(
+                        Cow::Owned(chunk_encoded),
+                        item.representation(),
+                        &mut output_view,
+                        &codec_options,
+                    )
                 } else {
                     // The chunk is missing, write the fill value
-                    unsafe {
-                        // SAFETY:
-                        // - data type and fill value are confirmed to be compatible when the ChunkRepresentation is created,
-                        // - output is an array with output_shape elements of the item.representation data type,
-                        // - item.subset is within the bounds of output_shape.
-                        copy_fill_value_into(
-                            item.representation().data_type(),
-                            item.representation().fill_value(),
-                            &output,
-                            &output_shape,
-                            &item.subset,
-                        )
-                    }
+                    copy_fill_value_into(
+                        item.representation().data_type(),
+                        item.representation().fill_value(),
+                        &mut output_view,
+                    )
                 }
             } else {
                 let input_handle = Arc::new(self.stores.decoder(&item)?);
@@ -314,19 +317,11 @@ impl CodecPipelineImpl {
                     .clone()
                     .partial_decoder(input_handle, item.representation(), &codec_options)
                     .map_py_err::()?;
-                unsafe {
-                    // SAFETY:
-                    // - output is an array with output_shape elements of the item.representation data type,
-                    // - item.subset is within the bounds of output_shape.
-                    // - item.chunk_subset has the same number of elements as item.subset.
-                    partial_decoder.partial_decode_into(
-                        &item.chunk_subset,
-                        &output,
-                        &output_shape,
-                        &item.subset,
-                        &codec_options,
-                    )
-                }
+                partial_decoder.partial_decode_into(
+                    &chunk_subset,
+                    &mut output_view,
+                    &codec_options,
+                )
             }
             .map_py_err::()
         };
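Note: the Cargo.toml hunk above pins both zarrs and zarrs_opendal to the same revision of https://github.com/LDeakin/zarrs.git so that both crates track one snapshot of the unreleased API. A minimal alternative sketch, not part of this change and only applicable if the pinned revision still satisfies the version requirements kept under [dependencies], would leave the crates-io entries in place and redirect their source through Cargo's [patch.crates-io] table:

[patch.crates-io]
# Hypothetical override: point both crates-io packages at the same git snapshot.
zarrs = { git = "https://github.com/LDeakin/zarrs.git", rev = "1354a0fb830144146a7bf6b5cff31f1f429a6f63" }
zarrs_opendal = { git = "https://github.com/LDeakin/zarrs.git", rev = "1354a0fb830144146a7bf6b5cff31f1f429a6f63" }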