Skip to content

Commit

Permalink
Switch to conditional PUT for branch updates (#755)
Browse files Browse the repository at this point in the history
We no longer do the sequence of write-only branch versions.
  • Loading branch information
paraseba authored Feb 20, 2025
1 parent 2d8345c commit 9f51c85
Show file tree
Hide file tree
Showing 86 changed files with 385 additions and 663 deletions.
4 changes: 0 additions & 4 deletions docs/docs/icechunk-python/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,6 @@ It allows you to configure the following parameters:

The threshold for when to inline a chunk into a manifest instead of storing it as a separate object in the storage backend.

### [`unsafe_overwrite_refs`](./reference.md#icechunk.RepositoryConfig.unsafe_overwrite_refs)

Whether to allow overwriting references in the repository.

### [`get_partial_values_concurrency`](./reference.md#icechunk.RepositoryConfig.get_partial_values_concurrency)

The number of concurrent requests to make when getting partial values from storage.
Expand Down
25 changes: 0 additions & 25 deletions icechunk-python/python/icechunk/_icechunk_python.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -570,7 +570,6 @@ class RepositoryConfig:
def __init__(
self,
inline_chunk_threshold_bytes: int | None = None,
unsafe_overwrite_refs: bool | None = None,
get_partial_values_concurrency: int | None = None,
compression: CompressionConfig | None = None,
caching: CachingConfig | None = None,
Expand All @@ -585,8 +584,6 @@ class RepositoryConfig:
----------
inline_chunk_threshold_bytes: int | None
The maximum size of a chunk that will be stored inline in the repository.
unsafe_overwrite_refs: bool | None
Whether to allow overwriting references in the repository.
get_partial_values_concurrency: int | None
The number of concurrent requests to make when getting partial values from storage.
compression: CompressionConfig | None
Expand Down Expand Up @@ -618,28 +615,6 @@ class RepositoryConfig:
"""
...
@property
def unsafe_overwrite_refs(self) -> bool | None:
"""
Whether to allow overwriting references in the repository.
Returns
-------
bool | None
Whether to allow overwriting references in the repository.
"""
...
@unsafe_overwrite_refs.setter
def unsafe_overwrite_refs(self, value: bool | None) -> None:
"""
Set whether to allow overwriting references in the repository.
Parameters
----------
value: bool | None
Whether to allow overwriting references in the repository.
"""
...
@property
def get_partial_values_concurrency(self) -> int | None:
"""
The number of concurrent requests to make when getting partial values from storage.
Expand Down
11 changes: 2 additions & 9 deletions icechunk-python/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -969,8 +969,6 @@ pub struct PyRepositoryConfig {
#[pyo3(get, set)]
pub inline_chunk_threshold_bytes: Option<u16>,
#[pyo3(get, set)]
pub unsafe_overwrite_refs: Option<bool>,
#[pyo3(get, set)]
pub get_partial_values_concurrency: Option<u16>,
#[pyo3(get, set)]
pub compression: Option<Py<PyCompressionConfig>>,
Expand All @@ -996,7 +994,6 @@ impl From<&PyRepositoryConfig> for RepositoryConfig {
fn from(value: &PyRepositoryConfig) -> Self {
Python::with_gil(|py| Self {
inline_chunk_threshold_bytes: value.inline_chunk_threshold_bytes,
unsafe_overwrite_refs: value.unsafe_overwrite_refs,
get_partial_values_concurrency: value.get_partial_values_concurrency,
compression: value.compression.as_ref().map(|c| (&*c.borrow(py)).into()),
caching: value.caching.as_ref().map(|c| (&*c.borrow(py)).into()),
Expand All @@ -1014,7 +1011,6 @@ impl From<RepositoryConfig> for PyRepositoryConfig {
#[allow(clippy::expect_used)]
Python::with_gil(|py| Self {
inline_chunk_threshold_bytes: value.inline_chunk_threshold_bytes,
unsafe_overwrite_refs: value.unsafe_overwrite_refs,
get_partial_values_concurrency: value.get_partial_values_concurrency,
compression: value.compression.map(|c| {
Py::new(py, Into::<PyCompressionConfig>::into(c))
Expand Down Expand Up @@ -1049,11 +1045,10 @@ impl PyRepositoryConfig {
}

#[new]
#[pyo3(signature = (inline_chunk_threshold_bytes = None, unsafe_overwrite_refs = None, get_partial_values_concurrency = None, compression = None, caching = None, storage = None, virtual_chunk_containers = None, manifest = None))]
#[pyo3(signature = (inline_chunk_threshold_bytes = None, get_partial_values_concurrency = None, compression = None, caching = None, storage = None, virtual_chunk_containers = None, manifest = None))]
#[allow(clippy::too_many_arguments)]
pub fn new(
inline_chunk_threshold_bytes: Option<u16>,
unsafe_overwrite_refs: Option<bool>,
get_partial_values_concurrency: Option<u16>,
compression: Option<Py<PyCompressionConfig>>,
caching: Option<Py<PyCachingConfig>>,
Expand All @@ -1063,7 +1058,6 @@ impl PyRepositoryConfig {
) -> Self {
Self {
inline_chunk_threshold_bytes,
unsafe_overwrite_refs,
get_partial_values_concurrency,
compression,
caching,
Expand Down Expand Up @@ -1129,9 +1123,8 @@ impl PyRepositoryConfig {
}));
// TODO: virtual chunk containers
format!(
r#"RepositoryConfig(inline_chunk_threshold_bytes={inl}, unsafe_overwrite_refs={uns}, get_partial_values_concurrency={partial}, compression={comp}, caching={caching}, storage={storage}, manifest={manifest})"#,
r#"RepositoryConfig(inline_chunk_threshold_bytes={inl}, get_partial_values_concurrency={partial}, compression={comp}, caching={caching}, storage={storage}, manifest={manifest})"#,
inl = format_option_to_string(self.inline_chunk_threshold_bytes),
uns = format_option(self.unsafe_overwrite_refs.map(format_bool)),
partial = format_option_to_string(self.get_partial_values_concurrency),
comp = comp,
caching = caching,
Expand Down
3 changes: 2 additions & 1 deletion icechunk-python/src/store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use icechunk::{
manifest::{Checksum, SecondsSinceEpoch, VirtualChunkLocation, VirtualChunkRef},
ChunkLength, ChunkOffset,
},
storage::ETag,
store::{StoreError, StoreErrorKind},
Store,
};
Expand Down Expand Up @@ -37,7 +38,7 @@ enum ChecksumArgument {
impl From<ChecksumArgument> for Checksum {
fn from(value: ChecksumArgument) -> Self {
match value {
ChecksumArgument::String(etag) => Checksum::ETag(etag),
ChecksumArgument::String(etag) => Checksum::ETag(ETag(etag)),
ChecksumArgument::Datetime(date_time) => {
Checksum::LastModified(SecondsSinceEpoch(date_time.timestamp() as u32))
}
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
9 changes: 4 additions & 5 deletions icechunk-python/tests/data/test-repo/config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
inline_chunk_threshold_bytes: 12
unsafe_overwrite_refs: null
get_partial_values_concurrency: null
compression: null
caching: null
Expand All @@ -13,6 +12,10 @@ virtual_chunk_containers:
endpoint_url: https://fly.storage.tigris.dev
anonymous: false
allow_http: false
az:
name: az
url_prefix: az
store: !azure {}
file:
name: file
url_prefix: file
Expand All @@ -21,10 +24,6 @@ virtual_chunk_containers:
name: gcs
url_prefix: gcs
store: !gcs {}
az:
name: az
url_prefix: az
store: !azure {}
s3:
name: s3
url_prefix: s3://
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"snapshot":"NXH3M0HJ7EEJ0699DPP0"}

This file was deleted.

This file was deleted.

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"snapshot":"XDZ162T1TYBEJMK99NPG"}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"snapshot":"394QWZDXAY74HP6Q8P3G"}
{"snapshot":"4QF8JA0YPDN51MHSSYVG"}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"snapshot":"TNE0TX645A2G7VTXFA1G"}
{"snapshot":"XDZ162T1TYBEJMK99NPG"}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"snapshot":"394QWZDXAY74HP6Q8P3G"}
{"snapshot":"4QF8JA0YPDN51MHSSYVG"}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
6 changes: 2 additions & 4 deletions icechunk-python/tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def test_virtual_chunk_containers() -> None:
container = icechunk.VirtualChunkContainer("custom", "s3://", store_config)
config.set_virtual_chunk_container(container)
assert re.match(
r"RepositoryConfig\(inline_chunk_threshold_bytes=None, unsafe_overwrite_refs=None, get_partial_values_concurrency=None, compression=None, caching=None, storage=None, manifest=.*\)",
r"RepositoryConfig\(inline_chunk_threshold_bytes=None, get_partial_values_concurrency=None, compression=None, caching=None, storage=None, manifest=.*\)",
repr(config),
)
assert config.virtual_chunk_containers
Expand Down Expand Up @@ -158,11 +158,9 @@ def test_can_change_deep_config_values() -> None:
)
config = icechunk.RepositoryConfig(
inline_chunk_threshold_bytes=11,
unsafe_overwrite_refs=False,
compression=icechunk.CompressionConfig(level=0),
)
config.inline_chunk_threshold_bytes = 5
config.unsafe_overwrite_refs = True
config.get_partial_values_concurrency = 42
config.compression = icechunk.CompressionConfig(level=8)
config.compression.level = 2
Expand All @@ -180,7 +178,7 @@ def test_can_change_deep_config_values() -> None:
)

assert re.match(
r"RepositoryConfig\(inline_chunk_threshold_bytes=5, unsafe_overwrite_refs=True, get_partial_values_concurrency=42, compression=CompressionConfig\(algorithm=None, level=2\), caching=CachingConfig\(num_snapshot_nodes=None, num_chunk_refs=8, num_transaction_changes=None, num_bytes_attributes=None, num_bytes_chunks=None\), storage=StorageSettings\(concurrency=StorageConcurrencySettings\(max_concurrent_requests_for_object=5, ideal_concurrent_request_size=1000000\)\), manifest=.*\)",
r"RepositoryConfig\(inline_chunk_threshold_bytes=5, get_partial_values_concurrency=42, compression=CompressionConfig\(algorithm=None, level=2\), caching=CachingConfig\(num_snapshot_nodes=None, num_chunk_refs=8, num_transaction_changes=None, num_bytes_attributes=None, num_bytes_chunks=None\), storage=StorageSettings\(concurrency=StorageConcurrencySettings\(max_concurrent_requests_for_object=5, ideal_concurrent_request_size=1000000\)\), manifest=.*\)",
repr(config),
)
repo = icechunk.Repository.open(
Expand Down
1 change: 0 additions & 1 deletion icechunk/examples/multithreaded_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
let storage = new_in_memory_storage().await?;
let config = RepositoryConfig {
inline_chunk_threshold_bytes: Some(128),
unsafe_overwrite_refs: Some(true),
..Default::default()
};
let repo = Repository::create(Some(config), storage, HashMap::new()).await?;
Expand Down
11 changes: 0 additions & 11 deletions icechunk/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -205,11 +205,6 @@ impl ManifestConfig {
pub struct RepositoryConfig {
/// Chunks smaller than this will be stored inline in the manifst
pub inline_chunk_threshold_bytes: Option<u16>,
/// Unsafely overwrite refs on write. This is not recommended, users should only use it at their
/// own risk in object stores for which we don't support write-object-if-not-exists. There is
/// the possibility of race conditions if this variable is set to true and there are concurrent
/// commit attempts.
pub unsafe_overwrite_refs: Option<bool>,

/// Concurrency used by the get_partial_values operation to fetch different keys in parallel
pub get_partial_values_concurrency: Option<u16>,
Expand All @@ -236,9 +231,6 @@ impl RepositoryConfig {
pub fn inline_chunk_threshold_bytes(&self) -> u16 {
self.inline_chunk_threshold_bytes.unwrap_or(512)
}
pub fn unsafe_overwrite_refs(&self) -> bool {
self.unsafe_overwrite_refs.unwrap_or(false)
}
pub fn get_partial_values_concurrency(&self) -> u16 {
self.get_partial_values_concurrency.unwrap_or(10)
}
Expand Down Expand Up @@ -268,9 +260,6 @@ impl RepositoryConfig {
inline_chunk_threshold_bytes: other
.inline_chunk_threshold_bytes
.or(self.inline_chunk_threshold_bytes),
unsafe_overwrite_refs: other
.unsafe_overwrite_refs
.or(self.unsafe_overwrite_refs),
get_partial_values_concurrency: other
.get_partial_values_concurrency
.or(self.get_partial_values_concurrency),
Expand Down
4 changes: 2 additions & 2 deletions icechunk/src/format/manifest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,7 @@ fn ref_to_payload(

fn checksum(payload: &gen::ChunkRef<'_>) -> Option<Checksum> {
if let Some(etag) = payload.checksum_etag() {
Some(Checksum::ETag(etag.to_string()))
Some(Checksum::ETag(ETag(etag.to_string())))
} else if payload.checksum_last_modified() > 0 {
Some(Checksum::LastModified(SecondsSinceEpoch(payload.checksum_last_modified())))
} else {
Expand Down Expand Up @@ -398,7 +398,7 @@ fn mk_chunk_ref<'bldr>(
Some(cs) => match cs {
Checksum::LastModified(_) => None,
Checksum::ETag(etag) => {
Some(builder.create_string(etag.as_str()))
Some(builder.create_string(etag.0.as_str()))
}
},
None => None,
Expand Down
2 changes: 1 addition & 1 deletion icechunk/src/ops/gc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,7 @@ pub async fn expire(
ExpireRefResult::RefIsExpired => match &r {
Ref::Tag(name) => {
if expired_tags == ExpiredRefAction::Delete {
delete_tag(storage, storage_settings, name.as_str(), false)
delete_tag(storage, storage_settings, name.as_str())
.await
.map_err(GCError::Ref)?;
result.deleted_refs.insert(r);
Expand Down
Loading

0 comments on commit 9f51c85

Please sign in to comment.