From 474e004876a1be70993f601cd5ebc102440812ea Mon Sep 17 00:00:00 2001 From: "Daniel Jahn (dahn)" Date: Sun, 23 Feb 2025 15:13:47 +0100 Subject: [PATCH] feat: unify API on snapshot_id (#770) --- Changelog.python.md | 2 +- docs/docs/icechunk-python/quickstart.md | 4 +-- docs/docs/icechunk-python/xarray.md | 2 +- .../notebooks/demo-dummy-data.ipynb | 4 +-- .../notebooks/version-control.ipynb | 4 +-- .../python/icechunk/_icechunk_python.pyi | 8 ++--- icechunk-python/python/icechunk/repository.py | 32 +++++++++++-------- icechunk-python/src/repository.rs | 22 ++++++------- icechunk-python/tests/test_can_read_old.py | 4 +-- .../tests/test_stateful_repo_ops.py | 2 +- icechunk-python/tests/test_timetravel.py | 14 ++++---- 11 files changed, 52 insertions(+), 46 deletions(-) diff --git a/Changelog.python.md b/Changelog.python.md index bd6d8441..4c75a70c 100644 --- a/Changelog.python.md +++ b/Changelog.python.md @@ -46,7 +46,7 @@ repo.readonly_session("dev") # still possible: repo.readonly_session(tag="v0.1") repo.readonly_session(branch="foo") -repo.readonly_session(snapshot="NXH3M0HJ7EEJ0699DPP0") +repo.readonly_session(snapshot_id="NXH3M0HJ7EEJ0699DPP0") ``` - Icechunk is now more resilient to changes in Zarr metadata spec, and can handle Zarr extensions. diff --git a/docs/docs/icechunk-python/quickstart.md b/docs/docs/icechunk-python/quickstart.md index a59924f3..d4b939cd 100644 --- a/docs/docs/icechunk-python/quickstart.md +++ b/docs/docs/icechunk-python/quickstart.md @@ -135,7 +135,7 @@ snapshot_id_2 = session_2.commit("overwrite some values") We can see the full version history of our repo: ```python -hist = repo.ancestry(snapshot=snapshot_id_2) +hist = repo.ancestry(snapshot_id=snapshot_id_2) for ancestor in hist: print(ancestor.id, ancestor.message, ancestor.written_at) @@ -151,7 +151,7 @@ for ancestor in hist: # latest version assert array[0] == 2 # check out earlier snapshot -earlier_session = repo.readonly_session(snapshot=hist[1].id) +earlier_session = repo.readonly_session(snapshot_id=hist[1].id) store = earlier_session.store # get the array diff --git a/docs/docs/icechunk-python/xarray.md b/docs/docs/icechunk-python/xarray.md index e1f4e2e9..257a7f64 100644 --- a/docs/docs/icechunk-python/xarray.md +++ b/docs/docs/icechunk-python/xarray.md @@ -154,7 +154,7 @@ xr.open_zarr(session.store, consolidated=False) We can also read data from previous snapshots by checking out prior versions: ```python -session = repo.readonly_session(snapshot=first_snapshot) +session = repo.readonly_session(snapshot_id=first_snapshot) xr.open_zarr(session.store, consolidated=False) # Size: 9MB diff --git a/icechunk-python/notebooks/demo-dummy-data.ipynb b/icechunk-python/notebooks/demo-dummy-data.ipynb index 9ed89e57..444847dd 100644 --- a/icechunk-python/notebooks/demo-dummy-data.ipynb +++ b/icechunk-python/notebooks/demo-dummy-data.ipynb @@ -335,7 +335,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "id": "d904f719-98cf-4f51-8e9a-1631dcb3fcba", "metadata": {}, "outputs": [ @@ -348,7 +348,7 @@ } ], "source": [ - "session = repo.readonly_session(snapshot=first_commit)\n", + "session = repo.readonly_session(snapshot_id=first_commit)\n", "root_group = zarr.open_group(session.store, mode=\"r\")\n", "\n", "try:\n", diff --git a/icechunk-python/notebooks/version-control.ipynb b/icechunk-python/notebooks/version-control.ipynb index 74c44cf0..abde76db 100644 --- a/icechunk-python/notebooks/version-control.ipynb +++ b/icechunk-python/notebooks/version-control.ipynb @@ -242,7 +242,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "e785d9a1-36ec-4207-b334-20e0a68e3ac8", "metadata": {}, "outputs": [ @@ -258,7 +258,7 @@ } ], "source": [ - "session = repo.readonly_session(snapshot=first_commit)\n", + "session = repo.readonly_session(snapshot_id=first_commit)\n", "root_group = zarr.open_group(store=session.store, mode=\"r\")\n", "dict(root_group.attrs)" ] diff --git a/icechunk-python/python/icechunk/_icechunk_python.pyi b/icechunk-python/python/icechunk/_icechunk_python.pyi index 7b9fa91c..c98fed8d 100644 --- a/icechunk-python/python/icechunk/_icechunk_python.pyi +++ b/icechunk-python/python/icechunk/_icechunk_python.pyi @@ -957,7 +957,7 @@ class PyRepository: *, branch: str | None = None, tag: str | None = None, - snapshot: str | None = None, + snapshot_id: str | None = None, ) -> AsyncIterator[SnapshotInfo]: ... def create_branch(self, branch: str, snapshot_id: str) -> None: ... def list_branches(self) -> set[str]: ... @@ -972,17 +972,17 @@ class PyRepository: self, from_branch: str | None = None, from_tag: str | None = None, - from_snapshot: str | None = None, + from_snapshot_id: str | None = None, to_branch: str | None = None, to_tag: str | None = None, - to_snapshot: str | None = None, + to_snapshot_id: str | None = None, ) -> Diff: ... def readonly_session( self, branch: str | None = None, *, tag: str | None = None, - snapshot: str | None = None, + snapshot_id: str | None = None, ) -> PySession: ... def writable_session(self, branch: str) -> PySession: ... def expire_snapshots( diff --git a/icechunk-python/python/icechunk/repository.py b/icechunk-python/python/icechunk/repository.py index 5a6f2ad3..3d26dfb1 100644 --- a/icechunk-python/python/icechunk/repository.py +++ b/icechunk-python/python/icechunk/repository.py @@ -196,7 +196,7 @@ def ancestry( *, branch: str | None = None, tag: str | None = None, - snapshot: str | None = None, + snapshot_id: str | None = None, ) -> Iterator[SnapshotInfo]: """ Get the ancestry of a snapshot. @@ -207,7 +207,7 @@ def ancestry( The branch to get the ancestry of. tag : str, optional The tag to get the ancestry of. - snapshot : str, optional + snapshot_id : str, optional The snapshot ID to get the ancestry of. Returns @@ -223,7 +223,9 @@ def ancestry( # the returned object is both an Async and Sync iterator res = cast( Iterator[SnapshotInfo], - self._repository.async_ancestry(branch=branch, tag=tag, snapshot=snapshot), + self._repository.async_ancestry( + branch=branch, tag=tag, snapshot_id=snapshot_id + ), ) return res @@ -232,7 +234,7 @@ def async_ancestry( *, branch: str | None = None, tag: str | None = None, - snapshot: str | None = None, + snapshot_id: str | None = None, ) -> AsyncIterator[SnapshotInfo]: """ Get the ancestry of a snapshot. @@ -243,7 +245,7 @@ def async_ancestry( The branch to get the ancestry of. tag : str, optional The tag to get the ancestry of. - snapshot : str, optional + snapshot_id : str, optional The snapshot ID to get the ancestry of. Returns @@ -255,7 +257,9 @@ def async_ancestry( ----- Only one of the arguments can be specified. """ - return self._repository.async_ancestry(branch=branch, tag=tag, snapshot=snapshot) + return self._repository.async_ancestry( + branch=branch, tag=tag, snapshot_id=snapshot_id + ) def create_branch(self, branch: str, snapshot_id: str) -> None: """ @@ -400,10 +404,10 @@ def diff( *, from_branch: str | None = None, from_tag: str | None = None, - from_snapshot: str | None = None, + from_snapshot_id: str | None = None, to_branch: str | None = None, to_tag: str | None = None, - to_snapshot: str | None = None, + to_snapshot_id: str | None = None, ) -> Diff: """ Compute an overview of the operations executed from version `from` to version `to`. @@ -421,10 +425,10 @@ def diff( return self._repository.diff( from_branch=from_branch, from_tag=from_tag, - from_snapshot=from_snapshot, + from_snapshot_id=from_snapshot_id, to_branch=to_branch, to_tag=to_tag, - to_snapshot=to_snapshot, + to_snapshot_id=to_snapshot_id, ) def readonly_session( @@ -432,7 +436,7 @@ def readonly_session( branch: str | None = None, *, tag: str | None = None, - snapshot: str | None = None, + snapshot_id: str | None = None, ) -> Session: """ Create a read-only session. @@ -447,7 +451,7 @@ def readonly_session( If provided, the branch to create the session on. tag : str, optional If provided, the tag to create the session on. - snapshot : str, optional + snapshot_id : str, optional If provided, the snapshot ID to create the session on. Returns @@ -460,7 +464,9 @@ def readonly_session( Only one of the arguments can be specified. """ return Session( - self._repository.readonly_session(branch=branch, tag=tag, snapshot=snapshot) + self._repository.readonly_session( + branch=branch, tag=tag, snapshot_id=snapshot_id + ) ) def writable_session(self, branch: str) -> Session: diff --git a/icechunk-python/src/repository.rs b/icechunk-python/src/repository.rs index d51f1ee6..ed11d0c2 100644 --- a/icechunk-python/src/repository.rs +++ b/icechunk-python/src/repository.rs @@ -506,18 +506,18 @@ impl PyRepository { } /// Returns an object that is both a sync and an async iterator - #[pyo3(signature = (*, branch = None, tag = None, snapshot = None))] + #[pyo3(signature = (*, branch = None, tag = None, snapshot_id = None))] pub fn async_ancestry( &self, py: Python<'_>, branch: Option, tag: Option, - snapshot: Option, + snapshot_id: Option, ) -> PyResult { let repo = Arc::clone(&self.0); // This function calls block_on, so we need to allow other thread python to make progress py.allow_threads(move || { - let version = args_to_version_info(branch, tag, snapshot)?; + let version = args_to_version_info(branch, tag, snapshot_id)?; let ancestry = pyo3_async_runtimes::tokio::get_runtime() .block_on(async move { repo.ancestry_arc(&version).await }) .map_err(PyIcechunkStoreError::RepositoryError)? @@ -689,20 +689,20 @@ impl PyRepository { }) } - #[pyo3(signature = (*, from_branch=None, from_tag=None, from_snapshot=None, to_branch=None, to_tag=None, to_snapshot=None))] + #[pyo3(signature = (*, from_branch=None, from_tag=None, from_snapshot_id=None, to_branch=None, to_tag=None, to_snapshot_id=None))] #[allow(clippy::too_many_arguments)] pub fn diff( &self, py: Python<'_>, from_branch: Option, from_tag: Option, - from_snapshot: Option, + from_snapshot_id: Option, to_branch: Option, to_tag: Option, - to_snapshot: Option, + to_snapshot_id: Option, ) -> PyResult { - let from = args_to_version_info(from_branch, from_tag, from_snapshot)?; - let to = args_to_version_info(to_branch, to_tag, to_snapshot)?; + let from = args_to_version_info(from_branch, from_tag, from_snapshot_id)?; + let to = args_to_version_info(to_branch, to_tag, to_snapshot_id)?; // This function calls block_on, so we need to allow other thread python to make progress py.allow_threads(move || { @@ -717,17 +717,17 @@ impl PyRepository { }) } - #[pyo3(signature = (*, branch = None, tag = None, snapshot = None))] + #[pyo3(signature = (*, branch = None, tag = None, snapshot_id = None))] pub fn readonly_session( &self, py: Python<'_>, branch: Option, tag: Option, - snapshot: Option, + snapshot_id: Option, ) -> PyResult { // This function calls block_on, so we need to allow other thread python to make progress py.allow_threads(move || { - let version = args_to_version_info(branch, tag, snapshot)?; + let version = args_to_version_info(branch, tag, snapshot_id)?; let session = pyo3_async_runtimes::tokio::get_runtime().block_on(async move { self.0 diff --git a/icechunk-python/tests/test_can_read_old.py b/icechunk-python/tests/test_can_read_old.py index 2e89c2d0..4931244c 100644 --- a/icechunk-python/tests/test_can_read_old.py +++ b/icechunk-python/tests/test_can_read_old.py @@ -177,7 +177,7 @@ async def test_icechunk_can_read_old_repo() -> None: "Repository initialized", ] assert [ - p.message for p in repo.ancestry(snapshot=main_snapshot) + p.message for p in repo.ancestry(snapshot_id=main_snapshot) ] == expected_main_history expected_branch_history = [ @@ -256,7 +256,7 @@ async def test_icechunk_can_read_old_repo() -> None: assert_array_equal(big_chunks[:], 42.0) parents = list(repo.ancestry(branch="main")) - diff = repo.diff(to_branch="main", from_snapshot=parents[-2].id) + diff = repo.diff(to_branch="main", from_snapshot_id=parents[-2].id) assert diff.new_groups == set() assert diff.new_arrays == set() assert set(diff.updated_chunks.keys()) == { diff --git a/icechunk-python/tests/test_stateful_repo_ops.py b/icechunk-python/tests/test_stateful_repo_ops.py index 1b6f5ef5..3d3a0695 100644 --- a/icechunk-python/tests/test_stateful_repo_ops.py +++ b/icechunk-python/tests/test_stateful_repo_ops.py @@ -271,7 +271,7 @@ def commit(self, message): @rule(ref=commits) def checkout_commit(self, ref): note(f"Checking out commit {ref}") - self.session = self.repo.readonly_session(snapshot=ref) + self.session = self.repo.readonly_session(snapshot_id=ref) assert self.session.read_only self.model.checkout_commit(ref) diff --git a/icechunk-python/tests/test_timetravel.py b/icechunk-python/tests/test_timetravel.py index 09a08cf2..79303a74 100644 --- a/icechunk-python/tests/test_timetravel.py +++ b/icechunk-python/tests/test_timetravel.py @@ -58,14 +58,14 @@ def test_timetravel() -> None: new_snapshot_id = session.commit("commit 2") - session = repo.readonly_session(snapshot=first_snapshot_id) + session = repo.readonly_session(snapshot_id=first_snapshot_id) store = session.store group = zarr.open_group(store=store, mode="r") air_temp = cast(zarr.core.array.Array, group["air_temp"]) assert store.read_only assert air_temp[200, 6] == 42 - session = repo.readonly_session(snapshot=new_snapshot_id) + session = repo.readonly_session(snapshot_id=new_snapshot_id) store = session.store group = zarr.open_group(store=store, mode="r") air_temp = cast(zarr.core.array.Array, group["air_temp"]) @@ -116,7 +116,7 @@ def test_timetravel() -> None: air_temp = cast(zarr.core.array.Array, group["air_temp"]) assert air_temp[200, 6] == 90 - parents = list(repo.ancestry(snapshot=feature_snapshot_id)) + parents = list(repo.ancestry(snapshot_id=feature_snapshot_id)) assert [snap.message for snap in parents] == [ "commit 3", "commit 2", @@ -128,7 +128,7 @@ def test_timetravel() -> None: assert list(repo.ancestry(tag="v1.0")) == parents assert list(repo.ancestry(branch="feature-not-dead")) == parents - diff = repo.diff(to_tag="v1.0", from_snapshot=parents[-1].id) + diff = repo.diff(to_tag="v1.0", from_snapshot_id=parents[-1].id) assert diff.new_groups == {"/"} assert diff.new_arrays == {"/air_temp"} assert list(diff.updated_chunks.keys()) == ["/air_temp"] @@ -185,11 +185,11 @@ def test_timetravel() -> None: with pytest.raises(ValueError, match="doesn't include"): # if we call diff in the wrong order it fails with a message - repo.diff(from_tag="v1.0", to_snapshot=parents[-1].id) + repo.diff(from_tag="v1.0", to_snapshot_id=parents[-1].id) # check async ancestry works - assert list(repo.ancestry(snapshot=feature_snapshot_id)) == asyncio.run( - async_ancestry(repo, snapshot=feature_snapshot_id) + assert list(repo.ancestry(snapshot_id=feature_snapshot_id)) == asyncio.run( + async_ancestry(repo, snapshot_id=feature_snapshot_id) ) assert list(repo.ancestry(tag="v1.0")) == asyncio.run( async_ancestry(repo, tag="v1.0")