diff --git a/ci/upstream.yml b/ci/upstream.yml index 2a8c1da3..597484ac 100644 --- a/ci/upstream.yml +++ b/ci/upstream.yml @@ -29,6 +29,6 @@ dependencies: - fsspec - pip - pip: - - icechunk>=0.1.0a12 # Installs python-zarr v3 as dependency + - icechunk>=0.1.0a12 # Installs zarr-python v3.0.0 as dependency - git+https://github.com/fsspec/kerchunk.git@main - imagecodecs-numcodecs==2024.6.1 diff --git a/docs/releases.rst b/docs/releases.rst index 2bbc74cf..32a4cc03 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -13,6 +13,7 @@ New Features (:pull:`309`) By `Tom Nicholas `_. - Added a ``.nbytes`` accessor method which displays the bytes needed to hold the virtual references in memory. (:issue:`167`, :pull:`227`) By `Tom Nicholas `_. +- Upgrade icechunk dependency to ``>=0.1.0a12``. (:pull:`406`) By `Julia Signell `_. - Sync with Icechunk v0.1.0a8 (:pull:`368`) By `Matthew Iannucci `. This also adds support for the `to_icechunk` method to add timestamps as checksums when writing virtual references to an icechunk store. This is useful for ensuring that virtual references are not stale when reading from an icechunk store, which can happen if the @@ -40,7 +41,7 @@ Breaking changes - The default backend for netCDF4 and HDF5 is now the custom ``HDFVirtualBackend`` replacing the previous default which was a wrapper around the kerchunk backend. (:issue:`374`, :pull:`395`) By `Julia Signell `_. -- Optional dependency on kerchunk is now the newly-released v0.2.8. This release of kerchunk is compatible with zarr-python v3, +- Optional dependency on kerchunk is now the newly-released v0.2.8. This release of kerchunk is compatible with zarr-python v3.0.0, which means a released version of kerchunk can now be used with both VirtualiZarr and Icechunk. (:issue:`392`, :pull:`406`, :pull:`412``) By `Julia Signell `_ and `Tom Nicholas `_. diff --git a/virtualizarr/tests/test_writers/test_icechunk.py b/virtualizarr/tests/test_writers/test_icechunk.py index b5f2048e..b7600c2e 100644 --- a/virtualizarr/tests/test_writers/test_icechunk.py +++ b/virtualizarr/tests/test_writers/test_icechunk.py @@ -20,7 +20,11 @@ from virtualizarr.zarr import ZArray if TYPE_CHECKING: - from icechunk import IcechunkStore, Storage # type: ignore[import-not-found] + from icechunk import ( # type: ignore[import-not-found] + IcechunkStore, + Repository, + Storage, + ) @pytest.fixture(scope="function") @@ -31,11 +35,16 @@ def icechunk_storage(tmp_path: Path) -> "Storage": @pytest.fixture(scope="function") -def icechunk_filestore(icechunk_storage: "Storage") -> "IcechunkStore": +def icechunk_repo(icechunk_storage: "Storage") -> "Repository": from icechunk import Repository repo = Repository.create(storage=icechunk_storage) - session = repo.writable_session("main") + return repo + + +@pytest.fixture(scope="function") +def icechunk_filestore(icechunk_repo: "Repository") -> "IcechunkStore": + session = icechunk_repo.writable_session("main") return session.store @@ -537,33 +546,32 @@ class TestAppend: # Success cases ## When appending to a single virtual ref without encoding, it succeeds def test_append_virtual_ref_without_encoding( - self, icechunk_storage: "Storage", simple_netcdf4: str + self, icechunk_repo: "Repository", simple_netcdf4: str ): import xarray.testing as xrt - from icechunk import Repository # generate virtual dataset vds = gen_virtual_dataset(file_uri=simple_netcdf4) - # create the icechunk store and commit the first virtual dataset - repo = Repository.create(storage=icechunk_storage) - session = repo.writable_session("main") - vds.virtualize.to_icechunk(session.store) - session.commit( + # Commit the first virtual dataset + writable_session = icechunk_repo.writable_session("main") + vds.virtualize.to_icechunk(writable_session.store) + writable_session.commit( "test commit" ) # need to commit it in order to append to it in the next lines + append_session = icechunk_repo.writable_session("main") # Append the same dataset to the same store - icechunk_filestore_append = repo.writable_session("main") - vds.virtualize.to_icechunk(icechunk_filestore_append.store, append_dim="x") - icechunk_filestore_append.commit("appended data") + vds.virtualize.to_icechunk(append_session.store, append_dim="x") + append_session.commit("appended data") - icechunk_filestore_append = repo.writable_session("main") - vds.virtualize.to_icechunk(icechunk_filestore_append.store, append_dim="x") - icechunk_filestore_append.commit("appended data again") + second_append_session = icechunk_repo.writable_session("main") + vds.virtualize.to_icechunk(second_append_session.store, append_dim="x") + second_append_session.commit("appended data again") + read_session = icechunk_repo.readonly_session(branch="main") with ( xr.open_zarr( - icechunk_filestore_append.store, consolidated=False, zarr_format=3 + read_session.store, consolidated=False, zarr_format=3 ) as array, xr.open_dataset(simple_netcdf4) as expected_ds, ): @@ -571,10 +579,9 @@ def test_append_virtual_ref_without_encoding( xrt.assert_identical(array, expected_array) def test_append_virtual_ref_with_encoding( - self, icechunk_storage: "Storage", netcdf4_files_factory: Callable + self, icechunk_repo: "Repository", netcdf4_files_factory: Callable ): import xarray.testing as xrt - from icechunk import Repository scale_factor = 0.01 encoding = {"air": {"scale_factor": scale_factor}} @@ -605,8 +612,7 @@ def test_append_virtual_ref_with_encoding( ), ) - # create the icechunk store and commit the first virtual dataset - icechunk_repo = Repository.create(storage=icechunk_storage) + # Commit the first virtual dataset icechunk_filestore = icechunk_repo.writable_session("main") vds1.virtualize.to_icechunk(icechunk_filestore.store) icechunk_filestore.commit( @@ -633,10 +639,9 @@ def test_append_virtual_ref_with_encoding( ## When appending to a virtual ref with encoding, it succeeds @pytest.mark.asyncio async def test_append_with_multiple_root_arrays( - self, icechunk_storage: "Storage", netcdf4_files_factory: Callable + self, icechunk_repo: "Repository", netcdf4_files_factory: Callable ): import xarray.testing as xrt - from icechunk import Repository from zarr.core.buffer import default_buffer_prototype filepath1, filepath2 = netcdf4_files_factory( @@ -713,8 +718,6 @@ async def test_append_with_multiple_root_arrays( ), ) - # create the icechunk store and commit the first virtual dataset - icechunk_repo = Repository.create(storage=icechunk_storage) icechunk_filestore = icechunk_repo.writable_session("main") vds1.virtualize.to_icechunk(icechunk_filestore.store) icechunk_filestore.commit( @@ -748,12 +751,11 @@ async def test_append_with_multiple_root_arrays( @pytest.mark.parametrize("zarr_format", [2, 3]) def test_append_with_compression_succeeds( self, - icechunk_storage: "Storage", + icechunk_repo: "Repository", netcdf4_files_factory: Callable, zarr_format: Literal[2, 3], ): import xarray.testing as xrt - from icechunk import Repository encoding = { "air": { @@ -792,8 +794,7 @@ def test_append_with_compression_succeeds( ), ) - # Create icechunk store and commit the compressed dataset - icechunk_repo = Repository.create(storage=icechunk_storage) + # Commit the compressed dataset icechunk_filestore = icechunk_repo.writable_session("main") vds1.virtualize.to_icechunk(icechunk_filestore.store) icechunk_filestore.commit("test commit") @@ -815,15 +816,12 @@ def test_append_with_compression_succeeds( ## When chunk shapes are different it fails def test_append_with_different_chunking_fails( - self, icechunk_storage: "Storage", simple_netcdf4: str + self, icechunk_repo: "Repository", simple_netcdf4: str ): - from icechunk import Repository - # Generate a virtual dataset with specific chunking vds = gen_virtual_dataset(file_uri=simple_netcdf4, chunk_shape=(3, 4)) - # Create icechunk store and commit the dataset - icechunk_repo = Repository.create(storage=icechunk_storage) + # Commit the dataset icechunk_filestore = icechunk_repo.writable_session("main") vds.virtualize.to_icechunk(icechunk_filestore.store) icechunk_filestore.commit("test commit") @@ -842,10 +840,8 @@ def test_append_with_different_chunking_fails( ## When encoding is different it fails def test_append_with_different_encoding_fails( - self, icechunk_storage: "Storage", simple_netcdf4: str + self, icechunk_repo: "Repository", simple_netcdf4: str ): - from icechunk import Repository - # Generate datasets with different encoding vds1 = gen_virtual_dataset( file_uri=simple_netcdf4, encoding={"scale_factor": 0.1} @@ -854,8 +850,7 @@ def test_append_with_different_encoding_fails( file_uri=simple_netcdf4, encoding={"scale_factor": 0.01} ) - # Create icechunk store and commit the first dataset - icechunk_repo = Repository.create(storage=icechunk_storage) + # Commit the first dataset icechunk_filestore = icechunk_repo.writable_session("main") vds1.virtualize.to_icechunk(icechunk_filestore.store) icechunk_filestore.commit("test commit") @@ -869,10 +864,8 @@ def test_append_with_different_encoding_fails( vds2.virtualize.to_icechunk(icechunk_filestore_append.store, append_dim="x") def test_dimensions_do_not_align( - self, icechunk_storage: "Storage", simple_netcdf4: str + self, icechunk_repo: "Repository", simple_netcdf4: str ): - from icechunk import Repository - # Generate datasets with different lengths on the non-append dimension (x) vds1 = gen_virtual_dataset( # {'x': 5, 'y': 4} @@ -885,8 +878,7 @@ def test_dimensions_do_not_align( shape=(6, 4), ) - # Create icechunk store and commit the first dataset - icechunk_repo = Repository.create(storage=icechunk_storage) + # Commit the first dataset icechunk_filestore = icechunk_repo.writable_session("main") vds1.virtualize.to_icechunk(icechunk_filestore.store) icechunk_filestore.commit("test commit") @@ -897,18 +889,15 @@ def test_dimensions_do_not_align( vds2.virtualize.to_icechunk(icechunk_filestore_append.store, append_dim="y") def test_append_dim_not_in_dims_raises_error( - self, icechunk_storage: "Storage", simple_netcdf4: str + self, icechunk_repo: "Repository", simple_netcdf4: str ): """ Test that attempting to append with an append_dim not present in dims raises a ValueError. """ - from icechunk import Repository - vds = gen_virtual_dataset( file_uri=simple_netcdf4, shape=(5, 4), chunk_shape=(5, 4), dims=["x", "y"] ) - icechunk_repo = Repository.create(storage=icechunk_storage) icechunk_filestore = icechunk_repo.writable_session("main") vds.virtualize.to_icechunk(icechunk_filestore.store) icechunk_filestore.commit("initial commit")