Add merge method for combining changes from multiple stores #154
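
A rough sketch of the workflow this PR enables, based on the diff and the test changes below (the helper function and its error type are illustrative, not part of the PR):

```rust
use icechunk::Repository;

// Illustrative helper: `coordinator` and `workers` are Repository instances that
// already hold uncommitted changes, e.g. produced by distributed writers.
async fn merge_and_commit(
    mut coordinator: Repository,
    workers: Vec<Repository>,
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    // Fold every worker's in-memory change set into the coordinator.
    coordinator.merge(workers).await?;
    // Publish the combined changes as a single snapshot with a regular commit.
    let _snapshot_id = coordinator.commit("main", "distributed commit", None).await?;
    Ok(())
}
```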

Closed · wants to merge 6 commits
Changes from 5 commits

93 changes: 33 additions & 60 deletions icechunk/src/repository.rs
@@ -757,26 +757,13 @@ impl Repository {
Ok(existing_array_chunks.chain(new_array_chunks))
}

pub async fn distributed_flush<I: IntoIterator<Item = ChangeSet>>(
pub async fn merge<I: IntoIterator<Item = Repository>>(
&mut self,
other_change_sets: I,
message: &str,
properties: SnapshotProperties,
) -> RepositoryResult<SnapshotId> {
// FIXME: this clone can be avoided
let change_sets = iter::once(self.change_set.clone()).chain(other_change_sets);
let new_snapshot_id = distributed_flush(
self.storage.as_ref(),
self.snapshot_id(),
change_sets,
message,
properties,
)
.await?;

self.snapshot_id = new_snapshot_id.clone();
self.change_set = ChangeSet::default();
Ok(new_snapshot_id)
other_repositories: I,
) -> RepositoryResult<()> {
let change_sets = other_repositories.into_iter().map(|r| r.change_set);
self.change_set.merge_many(change_sets);
Ok(())
}

/// After changes to the repository have been made, this generates and writes to `Storage` the updated datastructures.
@@ -791,36 +778,36 @@ impl Repository {
message: &str,
properties: SnapshotProperties,
) -> RepositoryResult<SnapshotId> {
self.distributed_flush(iter::empty(), message, properties).await
}
// TODO: can this clone be avoided? It's difficult because
// self is borrowed for flush and the change set should only
// be cleared after the flush is successful.
let mut change_set = self.change_set.clone();

pub async fn commit(
&mut self,
update_branch_name: &str,
message: &str,
properties: Option<SnapshotProperties>,
) -> RepositoryResult<SnapshotId> {
self.distributed_commit(update_branch_name, iter::empty(), message, properties)
.await
let new_snapshot_id = flush(
self.storage.as_ref(),
self.snapshot_id(),
&mut change_set,
message,
properties,
)
.await?;

self.snapshot_id = new_snapshot_id.clone();
self.change_set = ChangeSet::default();
Ok(new_snapshot_id)
}

pub async fn distributed_commit<I: IntoIterator<Item = ChangeSet>>(
pub async fn commit(
&mut self,
update_branch_name: &str,
other_change_sets: I,
message: &str,
properties: Option<SnapshotProperties>,
) -> RepositoryResult<SnapshotId> {
let current = fetch_branch_tip(self.storage.as_ref(), update_branch_name).await;

match current {
Err(RefError::RefNotFound(_)) => {
self.do_distributed_commit(
update_branch_name,
other_change_sets,
message,
properties,
)
.await
self.do_commit(update_branch_name, message, properties).await
}
Err(err) => Err(err.into()),
Ok(ref_data) => {
@@ -831,29 +818,21 @@
actual_parent: Some(ref_data.snapshot.clone()),
})
} else {
self.do_distributed_commit(
update_branch_name,
other_change_sets,
message,
properties,
)
.await
self.do_commit(update_branch_name, message, properties).await
}
}
}
}

async fn do_distributed_commit<I: IntoIterator<Item = ChangeSet>>(
async fn do_commit(
&mut self,
update_branch_name: &str,
other_change_sets: I,
message: &str,
properties: Option<SnapshotProperties>,
) -> RepositoryResult<SnapshotId> {
let parent_snapshot = self.snapshot_id.clone();
let properties = properties.unwrap_or_default();
let new_snapshot =
self.distributed_flush(other_change_sets, message, properties).await?;
let new_snapshot = self.flush(message, properties).await?;

match update_branch(
self.storage.as_ref(),
@@ -997,19 +976,13 @@ async fn updated_nodes<'a>(
.chain(change_set.new_nodes_iterator(manifest_id)))
}

async fn distributed_flush<I: IntoIterator<Item = ChangeSet>>(
async fn flush(
storage: &(dyn Storage + Send + Sync),
parent_id: &SnapshotId,
change_sets: I,
change_set: &mut ChangeSet,
message: &str,
properties: SnapshotProperties,
) -> RepositoryResult<SnapshotId> {
let mut change_set = ChangeSet::default();
change_set.merge_many(change_sets);
if change_set.is_empty() {
return Err(RepositoryError::NoChangesToCommit);
}

// We search for the current manifest. We are assuming a single one for now
let old_snapshot = storage.fetch_snapshot(parent_id).await?;
let old_snapshot_c = Arc::clone(&old_snapshot);
@@ -1038,8 +1011,8 @@ async fn distributed_flush<I: IntoIterator<Item = ChangeSet>>(
// As a solution, we temporarily `take` the map, replacing it with an empty one, run the thread,
// and at the end we put the map back to where it was, in case there is some later failure.
// We always want to leave things in the previous state if there was a failure.

let chunk_changes = Arc::new(change_set.take_chunks());
let chunks = change_set.take_chunks();
let chunk_changes = Arc::new(chunks);
let chunk_changes_c = Arc::clone(&chunk_changes);

let update_task = task::spawn_blocking(move || {
@@ -1070,7 +1043,7 @@ async fn distributed_flush<I: IntoIterator<Item = ChangeSet>>(
.await?;

let all_nodes =
updated_nodes(storage, &change_set, parent_id, &new_manifest_id).await?;
updated_nodes(storage, change_set, parent_id, &new_manifest_id).await?;

let mut new_snapshot = Snapshot::from_iter(
old_snapshot.as_ref(),
59 changes: 36 additions & 23 deletions icechunk/src/zarr.rs
@@ -262,6 +262,8 @@ pub enum StoreError {
NotFound(#[from] KeyNotFoundError),
#[error("unsuccessful repository operation: `{0}`")]
RepositoryError(#[from] RepositoryError),
#[error("error merging stores: `{0}`")]
MergeError(String),
#[error("cannot commit when no snapshot is present")]
NoSnapshot,
#[error("all commits must be made on a branch")]
@@ -435,33 +437,44 @@ impl Store {
Ok((snapshot_id, version))
}

pub async fn merge<I: IntoIterator<Item = Store>>(
&self,
other_stores: I,
) -> StoreResult<()> {
let repositories = other_stores
.into_iter()
.enumerate()
.map(|(i, store)| {
let repository_lock =
Arc::try_unwrap(store.repository).map_err(|_| {
StoreError::MergeError(format!(
"store at index {i} in merge operation is still in use"
))
})?;
let repository = repository_lock.into_inner();
Ok(repository)
})
.collect::<Result<Vec<_>, StoreError>>()?;
Contributor (author):

This is required to check for errors and exit early if there are some. IDK if this is the right approach...

If one fails, should the others still merge? What about the ones that come before?
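
For reference, with the collect into Result above, a single failure means nothing is merged at all: collecting an iterator of Results short-circuits at the first Err, so later stores are never unwrapped and Repository::merge never runs. A standalone illustration of that behavior (not icechunk code):

```rust
fn main() {
    let results = vec![Ok(1), Err("store at index 1 is still in use"), Ok(3)];
    // collect() stops at the first Err; Ok(3) is never even inspected.
    let collected: Result<Vec<i32>, &str> = results.into_iter().collect();
    assert_eq!(collected, Err("store at index 1 is still in use"));
}
```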

Collaborator:

I think this is the right approach. We never want jobs to successfully commit if one of their workers failed or is still working. This may even be somewhat common: bad concurrent code tries to merge while another thread is still doing work, and in that case, Arc::try_unwrap will fail.

If they want to commit anyway, they can do it explicitly, by not passing those stores. What is very important is recoverability: we let them know something is still running, they wait and try again. So, I think there are better return types for this function, something like:

-> StoreResult<Vec<(usize, Store)>>

This returns the list of Stores that are still pending. The user can wait on those somehow and try to merge them again. The ones that succeeded are gone (not really gone, just merged).

Things we should think more about:

  • I don't love the Vec in the return type, though; we may want to think about it some more. I'm a bit worried about the ugly case in which people use dask and every chunk becomes a task, and we have millions of things to merge.
  • How do we help them "wait" until they can merge?
  • There is a possible answer to both points: this function keeps retrying until it succeeds. So we don't need a return type, and we are the ones waiting, but that approach sounds quite unsatisfying.

nit: try_collect is usually more readable.
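
A rough sketch of that suggested shape, assuming the Store fields from this PR (repository: Arc<RwLock<Repository>>); the strong_count check is just one way to decide which stores count as still pending:

```rust
pub async fn merge<I: IntoIterator<Item = Store>>(
    &self,
    other_stores: I,
) -> StoreResult<Vec<(usize, Store)>> {
    let mut still_pending = Vec::new();
    let mut repositories = Vec::new();
    for (i, store) in other_stores.into_iter().enumerate() {
        // A store can only be merged if nothing else holds a clone of its repository.
        if Arc::strong_count(&store.repository) == 1 {
            let lock = Arc::try_unwrap(store.repository)
                .unwrap_or_else(|_| unreachable!("refcount checked above"));
            repositories.push(lock.into_inner());
        } else {
            still_pending.push((i, store));
        }
    }
    self.repository.write().await.merge(repositories).await?;
    // Callers can wait on these stores and try to merge them again later.
    Ok(still_pending)
}
```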

Collaborator:

A few more thoughts, still about the real-world scenario where one thread is trying to do a set while another is trying to merge all the repos.

  • Should this merge require a &mut instead? It may be more faithful to reality
  • Either way, my "unsatisfying approach" is not only unsatisfying but potentially deadlocking if not done carefully; both threads are trying to write to the repo.
  • I think I have a much better result type
-> Result<(), I::Iterator>

or whatever way you write that; there is probably some amount of IntoIterator conversion missing.

  • The idea is: I start merging, and I'll stop as soon as I find one Repository that is not ready to commit (by that we mean: one whose Arc cannot be unwrapped), and when I stop I'll give you back the iterator of the remaining repos.
  • Calling code can decide what to do next: simply retry with the remaining repos, wait and retry, skip the first repo, etc.
  • Not sure how useful it would be but we can provide a ready_to_merge function that verifies the ref count on the arc == 1

We should talk more about all this, fun stuff.
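
One possible shape for that idea, sketched with illustrative names (ready_to_merge, merge_until_blocked) and assuming the Arc<RwLock<Repository>> field from this PR. Stores are merged one by one; at the first store that is still in use, the caller gets back that store plus the unvisited remainder and can wait, retry, or skip it:

```rust
impl Store {
    /// True when no other clone of this store's repository is alive,
    /// i.e. Arc::try_unwrap would succeed right now.
    pub fn ready_to_merge(&self) -> bool {
        Arc::strong_count(&self.repository) == 1
    }

    /// Merge stores one by one, stopping at the first one that is still in use.
    /// On Err, the blocked store and the unvisited remainder are handed back.
    pub async fn merge_until_blocked<I>(
        &self,
        other_stores: I,
    ) -> Result<(), (Store, I::IntoIter)>
    where
        I: IntoIterator<Item = Store>,
    {
        let mut rest = other_stores.into_iter();
        while let Some(store) = rest.next() {
            if !store.ready_to_merge() {
                return Err((store, rest));
            }
            let lock = Arc::try_unwrap(store.repository)
                .unwrap_or_else(|_| unreachable!("refcount checked above"));
            // Repository::merge in this PR only folds in-memory change sets,
            // so it cannot fail here.
            self.repository
                .write()
                .await
                .merge(std::iter::once(lock.into_inner()))
                .await
                .expect("merging in-memory change sets does not fail");
        }
        Ok(())
    }
}
```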

Collaborator:

I just had another idea. All the issues arise because we are trying to merge multiple stores. This also makes it harder on the Python side, because we need to be very careful to use a generator and not a list. I think there is a much easier way: only allow merging one store into self. Let the user deal with gathering all of them and calling merge one by one. I think this is also easier on the user; they just need to get results as soon as they are produced, and call merge on them.
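
A sketch of that single-store variant, reusing the MergeError variant added in this PR; committing stays a separate, explicit step:

```rust
pub async fn merge(&self, other: Store) -> StoreResult<()> {
    // Refuse to merge a store that some other task still holds a clone of.
    let lock = Arc::try_unwrap(other.repository).map_err(|_| {
        StoreError::MergeError("store being merged is still in use".to_string())
    })?;
    // Fold the other repository's change set into ours; nothing is committed yet.
    self.repository
        .write()
        .await
        .merge(std::iter::once(lock.into_inner()))
        .await?;
    Ok(())
}
```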

Contributor (author):

This is also the approach I started doing in Python, so we got to the same place! I wound up reverting it because the lifetimes were driving me nuts, but I totally agree that is the approach we should use.


self.repository.write().await.merge(repositories).await?;
Ok(())
}

/// Commit the current changes to the current branch. If the store is not currently
/// on a branch, this will return an error.
pub async fn commit(&mut self, message: &str) -> StoreResult<SnapshotId> {
self.distributed_commit(message, vec![]).await
}
let Some(branch) = &self.current_branch else {
return Err(StoreError::NotOnBranch);
};

pub async fn distributed_commit<'a, I: IntoIterator<Item = Vec<u8>>>(
&mut self,
message: &str,
other_changesets_bytes: I,
) -> StoreResult<SnapshotId> {
if let Some(branch) = &self.current_branch {
let other_change_sets: Vec<ChangeSet> = other_changesets_bytes
.into_iter()
.map(|v| ChangeSet::import_from_bytes(v.as_slice()))
.try_collect()?;
let result = self
.repository
.write()
.await
.deref_mut()
.distributed_commit(branch, other_change_sets, message, None)
.await?;
Ok(result)
} else {
Err(StoreError::NotOnBranch)
}
let result = self
.repository
.write()
.await
.deref_mut()
.commit(branch, message, None)
.await?;
Ok(result)
}

/// Tag the given snapshot with a specified tag
13 changes: 3 additions & 10 deletions icechunk/tests/test_distributed_writes.rs
@@ -6,7 +6,7 @@ use bytes::Bytes;
use icechunk::{
format::{ByteRange, ChunkIndices, Path, SnapshotId},
metadata::{ChunkKeyEncoding, ChunkShape, DataType, FillValue},
repository::{get_chunk, ChangeSet, ZarrArrayMetadata},
repository::{get_chunk, ZarrArrayMetadata},
storage::s3::{S3Config, S3Credentials, S3Storage},
Repository, Storage,
};
@@ -171,17 +171,10 @@ async fn test_distributed_writes() -> Result<(), Box<dyn std::error::Error + Sen
let repo3 = write_results.pop().unwrap().unwrap();
let repo4 = write_results.pop().unwrap().unwrap();

// We get the ChangeSet from repos 2, 3 and 4, by converting them into bytes.
// This simulates a marshalling operation from a remote writer.
let change_sets: Vec<ChangeSet> = vec![repo2.into(), repo3.into(), repo4.into()];
let change_sets_bytes = change_sets.iter().map(|cs| cs.export_to_bytes().unwrap());
let change_sets = change_sets_bytes
.map(|bytes| ChangeSet::import_from_bytes(bytes.as_slice()).unwrap());

// Distributed commit now, using arbitrarily one of the repos as base and the others as extra
// changesets
let _new_snapshot =
repo1.distributed_commit("main", change_sets, "distributed commit", None).await?;
repo1.merge(vec![repo2, repo3, repo4]).await?;
let _new_snapshot = repo1.commit("main", "distributed commit", None).await?;

// We check we can read all chunks correctly
verify(repo1).await?;