From c54a052757d8d98bb1f5b9dbc9a182111cdca93a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 26 Sep 2024 20:26:51 -0500 Subject: [PATCH 01/74] Remove zarr pin --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0078a346b75..60fbff9378d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,7 +42,7 @@ dev = [ "ruff", "xarray[complete]", ] -io = ["netCDF4", "h5netcdf", "scipy", 'pydap; python_version<"3.10"', "zarr<3", "fsspec", "cftime", "pooch"] +io = ["netCDF4", "h5netcdf", "scipy", 'pydap; python_version<"3.10"', "zarr", "fsspec", "cftime", "pooch"] parallel = ["dask[complete]"] viz = ["matplotlib", "seaborn", "nc-time-axis"] From 483eb7f7d550a2f12fb27485a42cc540cc43c007 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 23 Sep 2024 13:35:44 -0500 Subject: [PATCH 02/74] Define zarr_v3 helper --- xarray/backends/zarr.py | 14 ++++++++++++++ xarray/tests/test_backends.py | 7 +++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 2c6b50b3589..e6fa551af93 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1,5 +1,6 @@ from __future__ import annotations +import functools import json import os import warnings @@ -7,6 +8,7 @@ from typing import TYPE_CHECKING, Any import numpy as np +import packaging.version import pandas as pd from xarray import coding, conventions @@ -40,6 +42,18 @@ from xarray.core.dataset import Dataset from xarray.core.datatree import DataTree + + +@functools.lru_cache +def _zarr_v3() -> bool: + try: + import zarr + except ImportError: + return False + else: + return packaging.version.parse(zarr.__version__).major >= 3 + + # need some special secret attributes to tell us the dimensions DIMENSION_KEY = "_ARRAY_DIMENSIONS" diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index ccf1bc73dd6..b8f65bb0731 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -111,13 +111,12 @@ except ImportError: KVStore = None -have_zarr_v3 = False +have_zarr_v3 = backends.zarr._zarr_v3() + try: # as of Zarr v2.13 these imports require environment variable # ZARR_V3_EXPERIMENTAL_API=1 - from zarr import DirectoryStoreV3, KVStoreV3 - - have_zarr_v3 = True + from zarr.store.memory import MemoryStore as KVStoreV3 except ImportError: KVStoreV3 = None From 40a746cbebd6fb511e49d0033044b2e7e8bc5fda Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 23 Sep 2024 13:43:33 -0500 Subject: [PATCH 03/74] zarr-v3: filters / compressors -> codecs --- doc/user-guide/io.rst | 3 ++- xarray/backends/zarr.py | 33 ++++++++++++++++++++++---- xarray/tests/test_backends.py | 13 +++++++--- xarray/tests/test_backends_datatree.py | 4 ++-- 4 files changed, 43 insertions(+), 10 deletions(-) diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index 1eb979e52f6..295d754c202 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -845,8 +845,9 @@ For example: .. ipython:: python import zarr + from numcodecs.blosc import Blosc - compressor = zarr.Blosc(cname="zstd", clevel=3, shuffle=2) + compressor = Blosc(cname="zstd", clevel=3, shuffle=2) ds.to_zarr("foo.zarr", encoding={"foo": {"compressor": compressor}}) .. note:: diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index e6fa551af93..44aa4462e37 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -89,7 +89,7 @@ def __init__(self, zarr_array): self.shape = self._array.shape # preserve vlen string object dtype (GH 7328) - if self._array.filters is not None and any( + if not _zarr_v3() and self._array.filters is not None and any( [filt.codec_id == "vlen-utf8" for filt in self._array.filters] ): dtype = coding.strings.create_vlen_dtype(str) @@ -337,6 +337,7 @@ def extract_zarr_variable_encoding( "filters", "cache_metadata", "write_empty_chunks", + "codec_pipeline", } for k in safe_to_drop: @@ -629,9 +630,21 @@ def open_store_variable(self, name, zarr_array=None): encoding = { "chunks": zarr_array.chunks, "preferred_chunks": dict(zip(dimensions, zarr_array.chunks, strict=True)), - "compressor": zarr_array.compressor, - "filters": zarr_array.filters, } + + if _zarr_v3() and zarr_array.metadata.zarr_format == 3: + encoding["codec_pipeline"] = [x.to_dict() for x in zarr_array.metadata.codecs] + elif _zarr_v3(): + encoding.update({ + "compressor": zarr_array.metadata.compressor, + "filters": zarr_array.metadata.filters, + }) + else: + encoding.update({ + "compressor": zarr_array.compressor, + "filters": zarr_array.filters, + }) + # _FillValue needs to be in attributes, not encoding, so it will get # picked up by decode_cf if zarr_array.fill_value is not None: @@ -865,8 +878,14 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No # - Existing variables already have their attrs included in the consolidated metadata file. # - The size of dimensions can not be expanded, that would require a call using `append_dim` # which is mutually exclusive with `region` + kwargs = {} + if _zarr_v3(): + kwargs["store"] = self.zarr_group.store + else: + kwargs["store"] = self.zarr_group.chunk_store + zarr_array = zarr.open( - store=self.zarr_group.chunk_store, + **kwargs, path=f"{self.zarr_group.name}/{name}", write_empty_chunks=self._write_empty, ) @@ -928,6 +947,12 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No else: encoding["write_empty_chunks"] = self._write_empty + if "codec_pipeline" in encoding: + # In zarr v3, the keyword argument is + # `codecs` but the serialized form is `codec_pipeline` + pipeline = encoding.pop("codec_pipeline") + encoding["codecs"] = pipeline + zarr_array = self.zarr_group.create( name, shape=shape, diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index b8f65bb0731..1ce1b701685 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2576,10 +2576,17 @@ def test_write_persistence_modes(self, group) -> None: def test_compressor_encoding(self) -> None: original = create_test_data() # specify a custom compressor - import zarr + from numcodecs.blosc import Blosc + + blosc_comp = Blosc(cname="zstd", clevel=3, shuffle=2) + + if have_zarr_v3: + encoding = {"codec_pipeline": [blosc_comp]} + else: + encoding = {"compressor": blosc_comp} + + save_kwargs = dict(encoding={"var1": encoding}) - blosc_comp = zarr.Blosc(cname="zstd", clevel=3, shuffle=2) - save_kwargs = dict(encoding={"var1": {"compressor": blosc_comp}}) with self.roundtrip(original, save_kwargs=save_kwargs) as ds: actual = ds["var1"].encoding["compressor"] # get_config returns a dictionary of compressor attributes diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index 8ca4711acad..ff400880446 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -256,12 +256,12 @@ def test_to_zarr(self, tmpdir, simple_datatree): assert_equal(original_dt, roundtrip_dt) def test_zarr_encoding(self, tmpdir, simple_datatree): - import zarr + from numcodecs.blosc import Blosc filepath = tmpdir / "test.zarr" original_dt = simple_datatree - comp = {"compressor": zarr.Blosc(cname="zstd", clevel=3, shuffle=2)} + comp = {"compressor": Blosc(cname="zstd", clevel=3, shuffle=2)} enc = {"/set2": {var: comp for var in original_dt["/set2"].dataset.data_vars}} original_dt.to_zarr(filepath, encoding=enc) roundtrip_dt = open_datatree(filepath, engine="zarr") From 6c8d2bb8a1af862fbf309a233231654865afd47d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 23 Sep 2024 14:51:39 -0500 Subject: [PATCH 04/74] zarr-v3: update tests to avoid values equal to fillValue --- xarray/tests/test_backends.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 1ce1b701685..5eba2466122 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -498,7 +498,7 @@ def test_dataset_caching(self) -> None: @pytest.mark.filterwarnings("ignore:deallocating CachingFileManager") def test_roundtrip_None_variable(self) -> None: - expected = Dataset({None: (("x", "y"), [[0, 1], [2, 3]])}) + expected = Dataset({None: (("x", "y"), [[1, 1], [2, 3]])}) with self.roundtrip(expected) as actual: assert_identical(expected, actual) @@ -615,7 +615,7 @@ def test_roundtrip_example_1_netcdf(self) -> None: def test_roundtrip_coordinates(self) -> None: original = Dataset( - {"foo": ("x", [0, 1])}, {"x": [2, 3], "y": ("a", [42]), "z": ("x", [4, 5])} + {"foo": ("x", [1, 1])}, {"x": [2, 3], "y": ("a", [42]), "z": ("x", [4, 5])} ) with self.roundtrip(original) as actual: @@ -631,7 +631,7 @@ def test_roundtrip_coordinates(self) -> None: def test_roundtrip_global_coordinates(self) -> None: original = Dataset( - {"foo": ("x", [0, 1])}, {"x": [2, 3], "y": ("a", [42]), "z": ("x", [4, 5])} + {"foo": ("x", [1, 1])}, {"x": [2, 3], "y": ("a", [42]), "z": ("x", [4, 5])} ) with self.roundtrip(original) as actual: assert_identical(original, actual) @@ -647,8 +647,8 @@ def test_roundtrip_global_coordinates(self) -> None: assert attrs["coordinates"] == "foo" def test_roundtrip_coordinates_with_space(self) -> None: - original = Dataset(coords={"x": 0, "y z": 1}) - expected = Dataset({"y z": 1}, {"x": 0}) + original = Dataset(coords={"x": 1, "y z": 1}) + expected = Dataset({"y z": 1}, {"x": 1}) with pytest.warns(SerializationWarning): with self.roundtrip(original) as actual: assert_identical(expected, actual) @@ -840,7 +840,7 @@ def test_dropna(self) -> None: a = np.random.randn(4, 3) a[1, 1] = np.nan in_memory = xr.Dataset( - {"a": (("y", "x"), a)}, coords={"y": np.arange(4), "x": np.arange(3)} + {"a": (("y", "x"), a)}, coords={"y": np.arange(1, 5), "x": np.arange(1, 4)} ) assert_identical( @@ -1060,11 +1060,11 @@ def _create_cf_dataset(): ), ), dict( - latitude=("latitude", [0, 1], {"units": "degrees_north"}), - longitude=("longitude", [0, 1], {"units": "degrees_east"}), + latitude=("latitude", [-1, 1], {"units": "degrees_north"}), + longitude=("longitude", [-1, 1], {"units": "degrees_east"}), latlon=((), -1, {"grid_mapping_name": "latitude_longitude"}), - latitude_bnds=(("latitude", "bnds2"), [[0, 1], [1, 2]]), - longitude_bnds=(("longitude", "bnds2"), [[0, 1], [1, 2]]), + latitude_bnds=(("latitude", "bnds2"), [[-1, 1], [1, 2]]), + longitude_bnds=(("longitude", "bnds2"), [[-1, 1], [1, 2]]), areas=( ("latitude", "longitude"), [[1, 1], [1, 1]], @@ -1158,7 +1158,7 @@ def equals_latlon(obj): return obj == "lat lon" or obj == "lon lat" original = Dataset( - {"temp": ("x", [0, 1]), "precip": ("x", [0, -1])}, + {"temp": ("x", [1, 1]), "precip": ("x", [-1, -1])}, {"lat": ("x", [2, 3]), "lon": ("x", [4, 5])}, ) with self.roundtrip(original) as actual: @@ -3032,7 +3032,7 @@ def test_encoding_chunksizes(self) -> None: # see also test_encoding_chunksizes_unlimited nx, ny, nt = 4, 4, 5 original = xr.Dataset( - {}, coords={"x": np.arange(nx), "y": np.arange(ny), "t": np.arange(nt)} + {}, coords={"x": np.arange(nx) + 1 , "y": np.arange(ny) + 1, "t": np.arange(nt) + 1} ) original["v"] = xr.Variable(("x", "y", "t"), np.zeros((nx, ny, nt))) original = original.chunk({"t": 1, "x": 2, "y": 2}) @@ -5214,7 +5214,7 @@ def test_dataarray_to_netcdf_no_name_pathlib(self) -> None: @requires_zarr class TestDataArrayToZarr: def test_dataarray_to_zarr_no_name(self, tmp_store) -> None: - original_da = DataArray(np.arange(12).reshape((3, 4))) + original_da = DataArray(np.arange(1, 13).reshape((3, 4))) original_da.to_zarr(tmp_store) @@ -5820,7 +5820,7 @@ def test_pickle_open_mfdataset_dataset(): @requires_zarr def test_zarr_closing_internal_zip_store(): store_name = "tmp.zarr.zip" - original_da = DataArray(np.arange(12).reshape((3, 4))) + original_da = DataArray(np.arange(1, 13).reshape((3, 4))) original_da.to_zarr(store_name, mode="w") with open_dataarray(store_name, engine="zarr") as loaded_da: From 531b52153fcc3caba8ffa53aa1b77704995adbb0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 26 Sep 2024 20:21:35 -0500 Subject: [PATCH 05/74] Various test fixes --- xarray/tests/test_backends.py | 236 +++++++++++++++++++++++----------- 1 file changed, 158 insertions(+), 78 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 5eba2466122..8e6308a3c7f 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -103,22 +103,29 @@ except ImportError: pass -have_zarr_kvstore = False + try: - from zarr.storage import KVStore + import zarr +except ImportError: + have_zarr = False +else: + have_zarr = True + +if have_zarr: + have_zarr_v3 = backends.zarr._zarr_v3() have_zarr_kvstore = True -except ImportError: - KVStore = None -have_zarr_v3 = backends.zarr._zarr_v3() + if have_zarr_v3: + from zarr.store import LocalStore as DirectoryStore + from zarr.store import MemoryStore as KVStore + else: + from zarr.storage import DirectoryStore, KVStore +else: + have_zarr_v3 = False + have_zarr_kvstore = False + have_zarr_directory_store = False -try: - # as of Zarr v2.13 these imports require environment variable - # ZARR_V3_EXPERIMENTAL_API=1 - from zarr.store.memory import MemoryStore as KVStoreV3 -except ImportError: - KVStoreV3 = None ON_WINDOWS = sys.platform == "win32" default_value = object() @@ -2209,16 +2216,20 @@ def create_zarr_target(self): def create_store(self): with self.create_zarr_target() as store_target: yield backends.ZarrStore.open_group( - store_target, mode="w", **self.version_kwargs + store_target, mode="a", **self.version_kwargs ) def save(self, dataset, store_target, **kwargs): # type: ignore[override] + for k, v in dataset.variables.items(): + if v.dtype.kind in ("U", "O", "M", "b"): + raise pytest.skip(reason=f"Unsupported dtype {v} for variable: {k}") + return dataset.to_zarr(store=store_target, **kwargs, **self.version_kwargs) @contextlib.contextmanager - def open(self, store_target, **kwargs): + def open(self, path, **kwargs): with xr.open_dataset( - store_target, engine="zarr", **kwargs, **self.version_kwargs + path, engine="zarr", mode="r", **kwargs, **self.version_kwargs ) as ds: yield ds @@ -2249,12 +2260,11 @@ def test_roundtrip_consolidated(self, consolidated) -> None: assert_identical(expected, actual) def test_read_non_consolidated_warning(self) -> None: - if self.zarr_version > 2: - pytest.xfail("consolidated metadata is not supported for zarr v3 yet") - expected = create_test_data() with self.create_zarr_target() as store: - expected.to_zarr(store, consolidated=False, **self.version_kwargs) + self.save( + expected, store_target=store, consolidated=False, **self.version_kwargs + ) with pytest.warns( RuntimeWarning, match="Failed to open Zarr store with consolidated", @@ -2263,9 +2273,16 @@ def test_read_non_consolidated_warning(self) -> None: assert_identical(ds, expected) def test_non_existent_store(self) -> None: - with pytest.raises(FileNotFoundError, match=r"No such file or directory:"): + if have_zarr_v3: + exc = ValueError + msg = "store mode" + else: + exc = FileNotFoundError + msg = "No such file or directory:" + with pytest.raises(exc, match=msg): xr.open_zarr(f"{uuid.uuid4()}") + @pytest.mark.skipif(have_zarr_v3, reason="chunk_store not implemented in zarr v3") def test_with_chunkstore(self) -> None: expected = create_test_data() with ( @@ -2517,7 +2534,7 @@ def test_hidden_zarr_keys(self) -> None: assert self.DIMENSION_KEY not in expected[var].attrs # put it back and try removing from a variable - del zarr_group.var2.attrs[self.DIMENSION_KEY] + del zarr_group["var2"].attrs[self.DIMENSION_KEY] with pytest.raises(KeyError): with xr.decode_cf(store): pass @@ -2601,8 +2618,6 @@ def test_group(self) -> None: assert_identical(original, actual) def test_zarr_mode_w_overwrites_encoding(self) -> None: - import zarr - data = Dataset({"foo": ("x", [1.0, 1.0, 1.0])}) with self.create_zarr_target() as store: data.to_zarr( @@ -2714,10 +2729,14 @@ def test_check_encoding_is_consistent_after_append(self) -> None: # check encoding consistency with self.create_zarr_target() as store_target: - import zarr + import numcodecs - compressor = zarr.Blosc() - encoding = {"da": {"compressor": compressor}} + compressor = numcodecs.Blosc() + + if have_zarr_v3: + encoding = {"da": {"compressor": compressor}} + else: + encoding = {"da": {"codecs": [compressor]}} ds.to_zarr(store_target, mode="w", encoding=encoding, **self.version_kwargs) ds_to_append.to_zarr(store_target, append_dim="time", **self.version_kwargs) actual_ds = xr.open_dataset( @@ -3032,7 +3051,12 @@ def test_encoding_chunksizes(self) -> None: # see also test_encoding_chunksizes_unlimited nx, ny, nt = 4, 4, 5 original = xr.Dataset( - {}, coords={"x": np.arange(nx) + 1 , "y": np.arange(ny) + 1, "t": np.arange(nt) + 1} + {}, + coords={ + "x": np.arange(nx) + 1, + "y": np.arange(ny) + 1, + "t": np.arange(nt) + 1, + }, ) original["v"] = xr.Variable(("x", "y", "t"), np.zeros((nx, ny, nt))) original = original.chunk({"t": 1, "x": 2, "y": 2}) @@ -3128,23 +3152,38 @@ def test_chunked_cftime_datetime(self) -> None: @requires_zarr @pytest.mark.skipif(not have_zarr_v3, reason="requires zarr version 3") class TestInstrumentedZarrStore: - methods = [ - "__iter__", - "__contains__", - "__setitem__", - "__getitem__", - "listdir", - "list_prefix", - ] + + if have_zarr_v3: + methods = [ + "get", + "set", + "list_dir", + "list_prefix", + ] + else: + methods = [ + "__iter__", + "__contains__", + "__setitem__", + "__getitem__", + "listdir", + "list_prefix", + ] @contextlib.contextmanager def create_zarr_target(self): - import zarr + if not have_zarr: + pytest.skip("Instrumented tests only work on latest Zarr.") if Version(zarr.__version__) < Version("2.18.0"): pytest.skip("Instrumented tests only work on latest Zarr.") - store = KVStoreV3({}) + if have_zarr_v3: + kwargs = {"mode": "a"} + else: + kwargs = {} + + store = KVStore({}, **kwargs) yield store def make_patches(self, store): @@ -3152,7 +3191,7 @@ def make_patches(self, store): return { method: MagicMock( - f"KVStoreV3.{method}", + f"KVStore.{method}", side_effect=getattr(store, method), autospec=True, ) @@ -3175,47 +3214,77 @@ def check_requests(self, expected, patches): assert summary[k] <= expected[k], (k, summary) def test_append(self) -> None: - original = Dataset({"foo": ("x", [1])}, coords={"x": [0]}) + original = Dataset({"foo": ("x", [1])}, coords={"x": [-1]}) modified = Dataset({"foo": ("x", [2])}, coords={"x": [1]}) + with self.create_zarr_target() as store: - expected = { - "iter": 2, - "contains": 9, - "setitem": 9, - "getitem": 6, - "listdir": 2, - "list_prefix": 2, - } + if have_zarr_v3: + # TOOD: verify these + expected = { + "set": 17, + "get": 12, + "list_dir": 3, + "list_prefix": 0, + } + else: + expected = { + "iter": 2, + "contains": 9, + "setitem": 9, + "getitem": 6, + "listdir": 2, + "list_prefix": 2, + } + patches = self.make_patches(store) - with patch.multiple(KVStoreV3, **patches): + with patch.multiple(KVStore, **patches): original.to_zarr(store) self.check_requests(expected, patches) patches = self.make_patches(store) # v2024.03.0: {'iter': 6, 'contains': 2, 'setitem': 5, 'getitem': 10, 'listdir': 6, 'list_prefix': 0} # 6057128b: {'iter': 5, 'contains': 2, 'setitem': 5, 'getitem': 10, "listdir": 5, "list_prefix": 0} - expected = { - "iter": 2, - "contains": 2, - "setitem": 5, - "getitem": 6, - "listdir": 2, - "list_prefix": 0, - } - with patch.multiple(KVStoreV3, **patches): + if have_zarr_v3: + expected = { + "set": 10, + "get": 6, + "list_dir": 2, + "list_prefix": 0, + } + else: + expected = { + "iter": 2, + "contains": 2, + "setitem": 5, + "getitem": 6, + "listdir": 2, + "list_prefix": 0, + } + + with patch.multiple(KVStore, **patches): modified.to_zarr(store, mode="a", append_dim="x") self.check_requests(expected, patches) patches = self.make_patches(store) - expected = { - "iter": 2, - "contains": 2, - "setitem": 5, - "getitem": 6, - "listdir": 2, - "list_prefix": 0, - } - with patch.multiple(KVStoreV3, **patches): + + if have_zarr_v3: + expected = { + "set": 10, + "get": 6, + "list_dir": 2, + "list_prefix": 0, + } + else: + expected = { + "iter": 2, + "contains": 2, + "setitem": 5, + "getitem": 6, + "listdir": 2, + "list_prefix": 0, + } + + with patch.multiple(KVStore, **patches): modified.to_zarr(store, mode="a-", append_dim="x") self.check_requests(expected, patches) @@ -3237,7 +3306,7 @@ def test_region_write(self) -> None: "list_prefix": 4, } patches = self.make_patches(store) - with patch.multiple(KVStoreV3, **patches): + with patch.multiple(KVStore, **patches): ds.to_zarr(store, mode="w", compute=False) self.check_requests(expected, patches) @@ -3252,7 +3321,7 @@ def test_region_write(self) -> None: "list_prefix": 0, } patches = self.make_patches(store) - with patch.multiple(KVStoreV3, **patches): + with patch.multiple(KVStore, **patches): ds.to_zarr(store, region={"x": slice(None)}) self.check_requests(expected, patches) @@ -3267,7 +3336,7 @@ def test_region_write(self) -> None: "list_prefix": 0, } patches = self.make_patches(store) - with patch.multiple(KVStoreV3, **patches): + with patch.multiple(KVStore, **patches): ds.to_zarr(store, region="auto") self.check_requests(expected, patches) @@ -3280,18 +3349,33 @@ def test_region_write(self) -> None: "list_prefix": 0, } patches = self.make_patches(store) - with patch.multiple(KVStoreV3, **patches): + with patch.multiple(KVStore, **patches): with open_dataset(store, engine="zarr") as actual: assert_identical(actual, ds) self.check_requests(expected, patches) +if have_zarr_v3: + ZARR_FORMATS = [2, 3] +else: + ZARR_FORMATS = [2] + + +@pytest.fixture(scope="module", params=ZARR_FORMATS) +def default_zarr_version(request) -> Generator[None, None]: + if have_zarr_v3: + with zarr.config.set(default_zarr_version=request.param): + yield + yield + + @requires_zarr +@pytest.mark.usefixtures("default_zarr_version") class TestZarrDictStore(ZarrBase): @contextlib.contextmanager def create_zarr_target(self): - if have_zarr_kvstore: - yield KVStore({}) + if have_zarr_v3: + yield zarr.store.MemoryStore({}, mode="a") else: yield {} @@ -3310,7 +3394,7 @@ def create_zarr_target(self): @contextlib.contextmanager def create_store(self): with self.create_zarr_target() as store_target: - group = backends.ZarrStore.open_group(store_target, mode="w") + group = backends.ZarrStore.open_group(store_target, mode="a") # older Zarr versions do not have the _store_version attribute if have_zarr_v3: # verify that a v2 store was created @@ -3409,9 +3493,6 @@ def test_avoid_excess_metadata_calls(self) -> None: https://github.com/pydata/xarray/issues/8290 """ - - import zarr - ds = xr.Dataset(data_vars={"test": (("Z",), np.array([123]).reshape(1))}) # The call to retrieve metadata performs a group lookup. We patch Group.__getitem__ @@ -3454,7 +3535,7 @@ def test_roundtrip_coordinates_with_space(self): class TestZarrKVStoreV3(ZarrBaseV3): @contextlib.contextmanager def create_zarr_target(self): - yield KVStoreV3({}) + yield KVStore({}, mode="a") @pytest.mark.skipif(not have_zarr_v3, reason="requires zarr version 3") @@ -3462,7 +3543,7 @@ class TestZarrDirectoryStoreV3(ZarrBaseV3): @contextlib.contextmanager def create_zarr_target(self): with create_tmp_file(suffix=".zr3") as tmp: - yield DirectoryStoreV3(tmp) + yield DirectoryStore(tmp, mode="a") @pytest.mark.skipif(not have_zarr_v3, reason="requires zarr version 3") @@ -5528,7 +5609,6 @@ def test_extract_zarr_variable_encoding() -> None: @pytest.mark.filterwarnings("ignore:deallocating CachingFileManager") def test_open_fsspec() -> None: import fsspec - import zarr if not hasattr(zarr.storage, "FSStore") or not hasattr( zarr.storage.FSStore, "getitems" From 849df40e798b9bad27a349982d902b41a56c9336 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 30 Sep 2024 08:52:07 -0500 Subject: [PATCH 06/74] zarr_version fixes * removed open_consolidated workarounds * removed _store_version check * pass through zarr_version --- xarray/backends/zarr.py | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 44aa4462e37..55dcfd5f916 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -5,7 +5,7 @@ import os import warnings from collections.abc import Callable, Iterable -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Literal import numpy as np import packaging.version @@ -56,6 +56,7 @@ def _zarr_v3() -> bool: # need some special secret attributes to tell us the dimensions DIMENSION_KEY = "_ARRAY_DIMENSIONS" +ZarrFormat = Literal[2, 3] def encode_zarr_attr_value(value): @@ -1436,10 +1437,6 @@ def _get_open_params( if isinstance(store, os.PathLike): store = os.fspath(store) - if zarr_version is None: - # default to 2 if store doesn't specify it's version (e.g. a path) - zarr_version = getattr(store, "_store_version", 2) - open_kwargs = dict( # mode='a-' is a handcrafted xarray specialty mode="a" if mode == "a-" else mode, @@ -1447,19 +1444,11 @@ def _get_open_params( path=group, ) open_kwargs["storage_options"] = storage_options - if zarr_version > 2: - open_kwargs["zarr_version"] = zarr_version - - if consolidated or consolidate_on_close: - raise ValueError( - "consolidated metadata has not been implemented for zarr " - f"version {zarr_version} yet. Set consolidated=False for " - f"zarr version {zarr_version}. See also " - "https://github.com/zarr-developers/zarr-specs/issues/136" - ) - if consolidated is None: - consolidated = False + if _zarr_v3(): + open_kwargs["zarr_format"] = zarr_version + else: + open_kwargs["zarr_version"] = zarr_version if chunk_store is not None: open_kwargs["chunk_store"] = chunk_store From 88bd64b1a2880aa440449043b5d511c6d05d2fb2 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 30 Sep 2024 09:02:02 -0500 Subject: [PATCH 07/74] fixup! zarr-v3: filters / compressors -> codecs --- xarray/tests/test_backends.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 8e6308a3c7f..f09757693ca 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2734,9 +2734,9 @@ def test_check_encoding_is_consistent_after_append(self) -> None: compressor = numcodecs.Blosc() if have_zarr_v3: - encoding = {"da": {"compressor": compressor}} - else: encoding = {"da": {"codecs": [compressor]}} + else: + encoding = {"da": {"compressor": compressor}} ds.to_zarr(store_target, mode="w", encoding=encoding, **self.version_kwargs) ds_to_append.to_zarr(store_target, append_dim="time", **self.version_kwargs) actual_ds = xr.open_dataset( From ef1549a5064f61cd004934c03da12469984a078b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 30 Sep 2024 09:25:19 -0500 Subject: [PATCH 08/74] fixup! fixup! zarr-v3: filters / compressors -> codecs --- xarray/backends/zarr.py | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 55dcfd5f916..12b6053fe55 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -43,7 +43,6 @@ from xarray.core.datatree import DataTree - @functools.lru_cache def _zarr_v3() -> bool: try: @@ -90,8 +89,10 @@ def __init__(self, zarr_array): self.shape = self._array.shape # preserve vlen string object dtype (GH 7328) - if not _zarr_v3() and self._array.filters is not None and any( - [filt.codec_id == "vlen-utf8" for filt in self._array.filters] + if ( + not _zarr_v3() + and self._array.filters is not None + and any([filt.codec_id == "vlen-utf8" for filt in self._array.filters]) ): dtype = coding.strings.create_vlen_dtype(str) else: @@ -634,17 +635,23 @@ def open_store_variable(self, name, zarr_array=None): } if _zarr_v3() and zarr_array.metadata.zarr_format == 3: - encoding["codec_pipeline"] = [x.to_dict() for x in zarr_array.metadata.codecs] + encoding["codec_pipeline"] = [ + x.to_dict() for x in zarr_array.metadata.codecs + ] elif _zarr_v3(): - encoding.update({ - "compressor": zarr_array.metadata.compressor, - "filters": zarr_array.metadata.filters, - }) + encoding.update( + { + "compressor": zarr_array.metadata.compressor, + "filters": zarr_array.metadata.filters, + } + ) else: - encoding.update({ - "compressor": zarr_array.compressor, - "filters": zarr_array.filters, - }) + encoding.update( + { + "compressor": zarr_array.compressor, + "filters": zarr_array.filters, + } + ) # _FillValue needs to be in attributes, not encoding, so it will get # picked up by decode_cf From 20c22bd4d601f1443503c45a4c83b422e86be2f0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 30 Sep 2024 09:51:49 -0500 Subject: [PATCH 09/74] fixup --- pyproject.toml | 1 + xarray/tests/test_backends.py | 1 + 2 files changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 60fbff9378d..bee8b0481f3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -116,6 +116,7 @@ module = [ "nc_time_axis.*", "netCDF4.*", "netcdftime.*", + "numcodecs.*", "opt_einsum.*", "pint.*", "pooch.*", diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index f09757693ca..5e6ef72db6d 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2273,6 +2273,7 @@ def test_read_non_consolidated_warning(self) -> None: assert_identical(ds, expected) def test_non_existent_store(self) -> None: + exc: type[Exception] if have_zarr_v3: exc = ValueError msg = "store mode" From 6087e5e422679f0cd4504309d5d4df952686ba05 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 30 Sep 2024 11:44:14 -0500 Subject: [PATCH 10/74] path / key normalization in set_variables --- xarray/backends/zarr.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 12b6053fe55..a5c4d7a1604 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -892,9 +892,13 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No else: kwargs["store"] = self.zarr_group.chunk_store + # TODO: see if zarr should normalize these strings. zarr_array = zarr.open( **kwargs, - path=f"{self.zarr_group.name}/{name}", + # path=f"{self.zarr_group.name}/{name}", + path="/".join([self.zarr_group.name.rstrip("/"), name]).lstrip( + "/" + ), write_empty_chunks=self._write_empty, ) else: From 15fe55ec15d350482ef57d2bd8416e2010f38f06 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 1 Oct 2024 11:55:13 -0500 Subject: [PATCH 11/74] fixes --- xarray/backends/api.py | 7 -- xarray/backends/zarr.py | 22 ++--- xarray/tests/test_backends.py | 156 ++++++++++++++++------------------ 3 files changed, 84 insertions(+), 101 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index e9e3e9beacd..01d92be1daa 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -1732,13 +1732,6 @@ def to_zarr( # validate Dataset keys, DataArray names _validate_dataset_names(dataset) - if zarr_version is None: - # default to 2 if store doesn't specify its version (e.g. a path) - zarr_version = int(getattr(store, "_store_version", 2)) - - if consolidated is None and zarr_version > 2: - consolidated = False - if mode == "r+": already_consolidated = consolidated consolidate_on_close = False diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index a5c4d7a1604..3e2c734a6b1 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -334,12 +334,12 @@ def extract_zarr_variable_encoding( safe_to_drop = {"source", "original_shape"} valid_encodings = { + "codecs", "chunks", "compressor", "filters", "cache_metadata", "write_empty_chunks", - "codec_pipeline", } for k in safe_to_drop: @@ -635,9 +635,7 @@ def open_store_variable(self, name, zarr_array=None): } if _zarr_v3() and zarr_array.metadata.zarr_format == 3: - encoding["codec_pipeline"] = [ - x.to_dict() for x in zarr_array.metadata.codecs - ] + encoding["codecs"] = [x.to_dict() for x in zarr_array.metadata.codecs] elif _zarr_v3(): encoding.update( { @@ -959,10 +957,8 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No else: encoding["write_empty_chunks"] = self._write_empty - if "codec_pipeline" in encoding: - # In zarr v3, the keyword argument is - # `codecs` but the serialized form is `codec_pipeline` - pipeline = encoding.pop("codec_pipeline") + if "codecs" in encoding: + pipeline = encoding.pop("codecs") encoding["codecs"] = pipeline zarr_array = self.zarr_group.create( @@ -1466,10 +1462,16 @@ def _get_open_params( if consolidated is None: consolidated = False + if _zarr_v3(): + missing_exc: type[Exception] = ValueError + else: + missing_exc = zarr.errors.GroupNotFoundError + if consolidated is None: try: zarr_group = zarr.open_consolidated(store, **open_kwargs) - except KeyError: + except (ValueError, KeyError): + # ValueError in zarr-python 3.x, KeyError in 2.x. try: zarr_group = zarr.open_group(store, **open_kwargs) warnings.warn( @@ -1487,7 +1489,7 @@ def _get_open_params( RuntimeWarning, stacklevel=stacklevel, ) - except zarr.errors.GroupNotFoundError as err: + except missing_exc as err: raise FileNotFoundError( f"No such file or directory: '{store}'" ) from err diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 5e6ef72db6d..e690eedf2a7 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -117,14 +117,32 @@ have_zarr_kvstore = True if have_zarr_v3: - from zarr.store import LocalStore as DirectoryStore - from zarr.store import MemoryStore as KVStore + from zarr.storage import MemoryStore as KVStore + + ZARR_FORMATS = [2, 3] else: - from zarr.storage import DirectoryStore, KVStore + from zarr.storage import KVStore + + ZARR_FORMATS = [2] else: have_zarr_v3 = False have_zarr_kvstore = False have_zarr_directory_store = False + ZARR_FORMATS = [] + + +@pytest.fixture(scope="module", params=ZARR_FORMATS) +def default_zarr_version(request) -> Generator[None, None]: + if have_zarr_v3: + with zarr.config.set(default_zarr_version=request.param): + yield + else: + yield + + +def skip_if_zarr_format_3(reason: str): + if have_zarr_v3 and zarr.config["default_zarr_version"] == 3: + pytest.xfail(reason=f"Unsupported with zarr_format=3: {reason}") ON_WINDOWS = sys.platform == "win32" @@ -1295,6 +1313,7 @@ def test_default_fill_value(self) -> None: with warnings.catch_warnings(): warnings.filterwarnings("ignore", ".*floating point data as an integer") with self.roundtrip(ds, save_kwargs=kwargs) as actual: + # XXX: zarr v3 always writes fill_value so this fails. assert "_FillValue" not in actual.x.encoding assert ds.x.encoding == {} @@ -2204,6 +2223,7 @@ def test_roundtrip_coordinates(self) -> None: @requires_zarr +@pytest.mark.usefixtures("default_zarr_version") class ZarrBase(CFEncodedBase): DIMENSION_KEY = "_ARRAY_DIMENSIONS" zarr_version = 2 @@ -2220,9 +2240,10 @@ def create_store(self): ) def save(self, dataset, store_target, **kwargs): # type: ignore[override] - for k, v in dataset.variables.items(): - if v.dtype.kind in ("U", "O", "M", "b"): - raise pytest.skip(reason=f"Unsupported dtype {v} for variable: {k}") + if have_zarr_v3 and zarr.config.config["default_zarr_version"] == 3: + for k, v in dataset.variables.items(): + if v.dtype.kind in ("U", "O", "M", "b"): + pytest.skip(reason=f"Unsupported dtype {v} for variable: {k}") return dataset.to_zarr(store=store_target, **kwargs, **self.version_kwargs) @@ -2248,8 +2269,6 @@ def roundtrip( @pytest.mark.parametrize("consolidated", [False, True, None]) def test_roundtrip_consolidated(self, consolidated) -> None: - if consolidated and self.zarr_version > 2: - pytest.xfail("consolidated metadata is not supported for zarr v3 yet") expected = create_test_data() with self.roundtrip( expected, @@ -2273,14 +2292,7 @@ def test_read_non_consolidated_warning(self) -> None: assert_identical(ds, expected) def test_non_existent_store(self) -> None: - exc: type[Exception] - if have_zarr_v3: - exc = ValueError - msg = "store mode" - else: - exc = FileNotFoundError - msg = "No such file or directory:" - with pytest.raises(exc, match=msg): + with pytest.raises(FileNotFoundError, match="No such file or directory"): xr.open_zarr(f"{uuid.uuid4()}") @pytest.mark.skipif(have_zarr_v3, reason="chunk_store not implemented in zarr v3") @@ -2567,7 +2579,7 @@ def test_write_persistence_modes(self, group) -> None: self.save(original, store, mode="w", group=group) with self.open(store, group=group) as actual: assert_identical(original, actual) - with pytest.raises(ValueError): + with pytest.raises((ValueError, FileExistsError)): self.save(original, store, mode="w-") # check append mode for normal write @@ -2598,8 +2610,8 @@ def test_compressor_encoding(self) -> None: blosc_comp = Blosc(cname="zstd", clevel=3, shuffle=2) - if have_zarr_v3: - encoding = {"codec_pipeline": [blosc_comp]} + if have_zarr_v3 and zarr.config.config["default_zarr_version"] == 3: + encoding = {"codecs": [blosc_comp]} else: encoding = {"compressor": blosc_comp} @@ -2734,16 +2746,20 @@ def test_check_encoding_is_consistent_after_append(self) -> None: compressor = numcodecs.Blosc() - if have_zarr_v3: - encoding = {"da": {"codecs": [compressor]}} + if have_zarr_v3 and zarr.config.config["default_zarr_version"] == 3: + encoding_key = "codecs" + encoding_value = [compressor] else: - encoding = {"da": {"compressor": compressor}} + encoding_key = "compressor" + encoding_value = compressor + + encoding = {"da": {encoding_key: encoding_value}} ds.to_zarr(store_target, mode="w", encoding=encoding, **self.version_kwargs) ds_to_append.to_zarr(store_target, append_dim="time", **self.version_kwargs) actual_ds = xr.open_dataset( store_target, engine="zarr", **self.version_kwargs ) - actual_encoding = actual_ds["da"].encoding["compressor"] + actual_encoding = actual_ds["da"].encoding[encoding_key] assert actual_encoding.get_config() == compressor.get_config() assert_identical( xr.open_dataset( @@ -2876,8 +2892,6 @@ def test_no_warning_from_open_emptydim_with_chunks(self) -> None: def test_write_region(self, consolidated, compute, use_dask, write_empty) -> None: if (use_dask or not compute) and not has_dask: pytest.skip("requires dask") - if consolidated and self.zarr_version > 2: - pytest.xfail("consolidated metadata is not supported for zarr v3 yet") zeros = Dataset({"u": (("x",), np.zeros(10))}) nonzeros = Dataset({"u": (("x",), np.arange(1, 11))}) @@ -3149,6 +3163,38 @@ def test_chunked_cftime_datetime(self) -> None: assert original[name].chunks == actual_var.chunks assert original.chunks == actual.chunks + def test_write_store(self) -> None: + skip_if_zarr_format_3(reason="unsupported dtypes") + return super().test_write_store() + + def test_roundtrip_endian(self) -> None: + skip_if_zarr_format_3(reason="unsupported dtypes") + return super().test_roundtrip_endian() + + def test_roundtrip_bytes_with_fill_value(self) -> None: + skip_if_zarr_format_3(reason="unsupported dtypes") + return super().test_roundtrip_bytes_with_fill_value() + + def test_default_fill_value(self) -> None: + skip_if_zarr_format_3(reason="fill_value always written") + return super().test_default_fill_value() + + def test_explicitly_omit_fill_value(self) -> None: + skip_if_zarr_format_3(reason="fill_value always written") + return super().test_explicitly_omit_fill_value() + + def test_explicitly_omit_fill_value_via_encoding_kwarg(self) -> None: + skip_if_zarr_format_3(reason="fill_value always written") + return super().test_explicitly_omit_fill_value_via_encoding_kwarg() + + def test_explicitly_omit_fill_value_in_coord(self) -> None: + skip_if_zarr_format_3(reason="fill_value always written") + return super().test_explicitly_omit_fill_value_in_coord() + + def test_explicitly_omit_fill_value_in_coord_via_encoding_kwarg(self) -> None: + skip_if_zarr_format_3(reason="fill_value always written") + return super().test_explicitly_omit_fill_value_in_coord_via_encoding_kwarg() + @requires_zarr @pytest.mark.skipif(not have_zarr_v3, reason="requires zarr version 3") @@ -3356,27 +3402,12 @@ def test_region_write(self) -> None: self.check_requests(expected, patches) -if have_zarr_v3: - ZARR_FORMATS = [2, 3] -else: - ZARR_FORMATS = [2] - - -@pytest.fixture(scope="module", params=ZARR_FORMATS) -def default_zarr_version(request) -> Generator[None, None]: - if have_zarr_v3: - with zarr.config.set(default_zarr_version=request.param): - yield - yield - - @requires_zarr -@pytest.mark.usefixtures("default_zarr_version") class TestZarrDictStore(ZarrBase): @contextlib.contextmanager def create_zarr_target(self): if have_zarr_v3: - yield zarr.store.MemoryStore({}, mode="a") + yield zarr.storage.MemoryStore({}, mode="a") else: yield {} @@ -3396,10 +3427,6 @@ def create_zarr_target(self): def create_store(self): with self.create_zarr_target() as store_target: group = backends.ZarrStore.open_group(store_target, mode="a") - # older Zarr versions do not have the _store_version attribute - if have_zarr_v3: - # verify that a v2 store was created - assert group.zarr_group.store._store_version == 2 yield group @@ -3520,45 +3547,6 @@ def test_avoid_excess_metadata_calls(self) -> None: assert mock.call_count == call_count -class ZarrBaseV3(ZarrBase): - zarr_version = 3 - - def test_roundtrip_coordinates_with_space(self): - original = Dataset(coords={"x": 0, "y z": 1}) - with pytest.warns(SerializationWarning): - # v3 stores do not allow spaces in the key name - with pytest.raises(ValueError): - with self.roundtrip(original): - pass - - -@pytest.mark.skipif(not have_zarr_v3, reason="requires zarr version 3") -class TestZarrKVStoreV3(ZarrBaseV3): - @contextlib.contextmanager - def create_zarr_target(self): - yield KVStore({}, mode="a") - - -@pytest.mark.skipif(not have_zarr_v3, reason="requires zarr version 3") -class TestZarrDirectoryStoreV3(ZarrBaseV3): - @contextlib.contextmanager - def create_zarr_target(self): - with create_tmp_file(suffix=".zr3") as tmp: - yield DirectoryStore(tmp, mode="a") - - -@pytest.mark.skipif(not have_zarr_v3, reason="requires zarr version 3") -class TestZarrDirectoryStoreV3FromPath(TestZarrDirectoryStoreV3): - # Must specify zarr_version=3 to get a v3 store because create_zarr_target - # is a string path. - version_kwargs = {"zarr_version": 3} - - @contextlib.contextmanager - def create_zarr_target(self): - with create_tmp_file(suffix=".zr3") as tmp: - yield tmp - - @requires_zarr @requires_fsspec def test_zarr_storage_options() -> None: From 8e06bc711e5e2d9d4a9282188ec0c8e36d51903d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 1 Oct 2024 14:37:01 -0500 Subject: [PATCH 12/74] workaround nested consolidated metadata --- xarray/backends/zarr.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 3e2c734a6b1..27de355b467 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -820,7 +820,11 @@ def store( variables_to_set, check_encoding_set, writer, unlimited_dims=unlimited_dims ) if self._consolidate_on_close: - zarr.consolidate_metadata(self.zarr_group.store) + kwargs = {} + if _zarr_v3(): + # https://github.com/zarr-developers/zarr-python/pull/2113#issuecomment-2386718323 + kwargs["path"] = self.zarr_group.name.lstrip("/") + zarr.consolidate_metadata(self.zarr_group.store, **kwargs) def sync(self): pass From f8c427fa95f0a536b3a06bc83f2692a82221e11d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 1 Oct 2024 14:42:33 -0500 Subject: [PATCH 13/74] test: avoid fill_value --- xarray/tests/test_backends.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 035045e858a..a16727d6ad8 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2985,7 +2985,7 @@ def test_write_preexisting_override_metadata(self) -> None: assert_identical(actual, only_new_data) def test_write_region_errors(self) -> None: - data = Dataset({"u": (("x",), np.arange(5))}) + data = Dataset({"u": (("x",), np.arange(1, 6))}) data2 = Dataset({"u": (("x",), np.array([10, 11]))}) @contextlib.contextmanager @@ -2997,7 +2997,7 @@ def setup_and_verify_store(expected=data): assert_identical(actual, expected) # verify the base case works - expected = Dataset({"u": (("x",), np.array([10, 11, 2, 3, 4]))}) + expected = Dataset({"u": (("x",), np.array([10, 11, 3, 4, 5]))}) with setup_and_verify_store(expected) as store: data2.to_zarr(store, region={"x": slice(2)}, **self.version_kwargs) From 594d36db4e2ec0a7f33e7e424e373909d5d1378d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 2 Oct 2024 07:19:54 -0500 Subject: [PATCH 14/74] test: Adjust call counts --- xarray/tests/test_backends.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index a16727d6ad8..9415549a3d5 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3294,8 +3294,8 @@ def test_append(self) -> None: if have_zarr_v3: expected = { "set": 10, - "get": 6, - "list_dir": 2, + "get": 8, + "list_dir": 1, "list_prefix": 0, } else: @@ -3317,7 +3317,7 @@ def test_append(self) -> None: if have_zarr_v3: expected = { "set": 10, - "get": 6, + "get": 8, "list_dir": 2, "list_prefix": 0, } From 046d37ea2ec7a15cfdd148491676179551ee836a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 2 Oct 2024 07:20:08 -0500 Subject: [PATCH 15/74] zarr-python 3.x Array.resize doesn't mutate --- xarray/backends/zarr.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 20c3add54d5..2f7fe8702b1 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -915,7 +915,10 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No new_shape = list(zarr_array.shape) new_shape[append_axis] += v.shape[append_axis] - zarr_array.resize(new_shape) + if _zarr_v3(): + zarr_array = zarr_array.resize(new_shape) + else: + zarr_array.resize(new_shape) zarr_shape = zarr_array.shape From 6b0ca62e483c6f699db09865d58835958a3e1735 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 2 Oct 2024 07:29:52 -0500 Subject: [PATCH 16/74] test compatibility - skip write_empty_chunks on 3.x - update patch targets --- xarray/tests/test_backends.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 9415549a3d5..fa16f196a07 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3460,6 +3460,9 @@ def roundtrip_dir( @pytest.mark.parametrize("consolidated", [True, False, None]) @pytest.mark.parametrize("write_empty", [True, False, None]) + @pytest.mark.skipif( + have_zarr_v3, reason="zarr-python 3.x removed write_empty_chunks" + ) def test_write_empty( self, consolidated: bool | None, write_empty: bool | None ) -> None: @@ -3527,13 +3530,19 @@ def test_avoid_excess_metadata_calls(self) -> None: # so that we can inspect calls to this method - specifically count of calls. # Use of side_effect means that calls are passed through to the original method # rather than a mocked method. - Group = zarr.hierarchy.Group - with ( - self.create_zarr_target() as store, - patch.object( + + if have_zarr_v3: + Group = zarr.AsyncGroup + patched = patch.object( + Group, "getitem", side_effect=Group.getitem, autospec=True + ) + else: + Group = zarr.Group + patched = patch.object( Group, "__getitem__", side_effect=Group.__getitem__, autospec=True - ) as mock, - ): + ) + + with self.create_zarr_target() as store, patched as mock: ds.to_zarr(store, mode="w") # We expect this to request array metadata information, so call_count should be == 1, From d315583f952db058338a3d15aab373efabdcc3dd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 2 Oct 2024 11:15:10 -0500 Subject: [PATCH 17/74] skip ZipStore with_mode --- xarray/tests/test_backends.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index fa16f196a07..0951410525a 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5292,7 +5292,13 @@ def test_dataarray_to_netcdf_no_name_pathlib(self) -> None: @requires_zarr class TestDataArrayToZarr: + + def skip_if_zarr_python_3_and_zip_store(self, store) -> None: + if isinstance(store, zarr.storage.ZipStore): + pytest.skip(reason="zarr-python 3.x doesn't support reopening ZipStore with a new mode.") + def test_dataarray_to_zarr_no_name(self, tmp_store) -> None: + skip_if_zarr_format_3(tmp_store) original_da = DataArray(np.arange(1, 13).reshape((3, 4))) original_da.to_zarr(tmp_store) @@ -5301,7 +5307,8 @@ def test_dataarray_to_zarr_no_name(self, tmp_store) -> None: assert_identical(original_da, loaded_da) def test_dataarray_to_zarr_with_name(self, tmp_store) -> None: - original_da = DataArray(np.arange(12).reshape((3, 4)), name="test") + skip_if_zarr_format_3(tmp_store) + original_da = DataArray(np.arange(12).reshape((3, 4)) + 1, name="test") original_da.to_zarr(tmp_store) @@ -5309,6 +5316,7 @@ def test_dataarray_to_zarr_with_name(self, tmp_store) -> None: assert_identical(original_da, loaded_da) def test_dataarray_to_zarr_coord_name_clash(self, tmp_store) -> None: + skip_if_zarr_format_3(tmp_store) original_da = DataArray( np.arange(12).reshape((3, 4)), dims=["x", "y"], name="x" ) @@ -5319,7 +5327,8 @@ def test_dataarray_to_zarr_coord_name_clash(self, tmp_store) -> None: assert_identical(original_da, loaded_da) def test_open_dataarray_options(self, tmp_store) -> None: - data = DataArray(np.arange(5), coords={"y": ("x", range(5))}, dims=["x"]) + skip_if_zarr_format_3(tmp_store) + data = DataArray(np.arange(5) + 1, coords={"y": ("x", range(1, 6))}, dims=["x"]) data.to_zarr(tmp_store) @@ -5332,6 +5341,7 @@ def test_dataarray_to_zarr_compute_false(self, tmp_store) -> None: from dask.delayed import Delayed original_da = DataArray(np.arange(12).reshape((3, 4))) + skip_if_zarr_format_3(tmp_store) output = original_da.to_zarr(tmp_store, compute=False) assert isinstance(output, Delayed) From 389cc82ebabf4265ee2700b3fcddc9366f4a5aa7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 2 Oct 2024 11:15:50 -0500 Subject: [PATCH 18/74] test: more fill_value avoidance --- xarray/tests/test_backends.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 0951410525a..cb167f61069 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5295,7 +5295,9 @@ class TestDataArrayToZarr: def skip_if_zarr_python_3_and_zip_store(self, store) -> None: if isinstance(store, zarr.storage.ZipStore): - pytest.skip(reason="zarr-python 3.x doesn't support reopening ZipStore with a new mode.") + pytest.skip( + reason="zarr-python 3.x doesn't support reopening ZipStore with a new mode." + ) def test_dataarray_to_zarr_no_name(self, tmp_store) -> None: skip_if_zarr_format_3(tmp_store) @@ -5318,7 +5320,7 @@ def test_dataarray_to_zarr_with_name(self, tmp_store) -> None: def test_dataarray_to_zarr_coord_name_clash(self, tmp_store) -> None: skip_if_zarr_format_3(tmp_store) original_da = DataArray( - np.arange(12).reshape((3, 4)), dims=["x", "y"], name="x" + np.arange(12).reshape((3, 4)) + 1, dims=["x", "y"], name="x" ) original_da.to_zarr(tmp_store) @@ -5340,8 +5342,8 @@ def test_open_dataarray_options(self, tmp_store) -> None: def test_dataarray_to_zarr_compute_false(self, tmp_store) -> None: from dask.delayed import Delayed - original_da = DataArray(np.arange(12).reshape((3, 4))) skip_if_zarr_format_3(tmp_store) + original_da = DataArray(np.arange(12).reshape((3, 4)) + 1) output = original_da.to_zarr(tmp_store, compute=False) assert isinstance(output, Delayed) @@ -5916,10 +5918,11 @@ def test_zarr_closing_internal_zip_store(): @requires_zarr +@pytest.mark.usefixtures("default_zarr_version") class TestZarrRegionAuto: def test_zarr_region_auto_all(self, tmp_path): - x = np.arange(0, 50, 10) - y = np.arange(0, 20, 2) + x = np.arange(0, 50, 10) + 1 + y = np.arange(0, 20, 2) + 1 data = np.ones((5, 10)) ds = xr.Dataset( { @@ -5942,8 +5945,8 @@ def test_zarr_region_auto_all(self, tmp_path): assert_identical(ds_updated, expected) def test_zarr_region_auto_mixed(self, tmp_path): - x = np.arange(0, 50, 10) - y = np.arange(0, 20, 2) + x = np.arange(0, 50, 10) + 1 + y = np.arange(0, 20, 2) + 1 data = np.ones((5, 10)) ds = xr.Dataset( { @@ -5968,8 +5971,8 @@ def test_zarr_region_auto_mixed(self, tmp_path): assert_identical(ds_updated, expected) def test_zarr_region_auto_noncontiguous(self, tmp_path): - x = np.arange(0, 50, 10) - y = np.arange(0, 20, 2) + x = np.arange(0, 50, 10) + 1 + y = np.arange(0, 20, 2) + 1 data = np.ones((5, 10)) ds = xr.Dataset( { From 1fe409ab6428373aa8c9608123a69faa983372bb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 2 Oct 2024 11:38:44 -0500 Subject: [PATCH 19/74] test: more fill_value avoidance --- xarray/tests/test_backends.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index cb167f61069..fb521919a90 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5902,12 +5902,14 @@ def test_raise_writing_to_nczarr(self, mode) -> None: @requires_netCDF4 @requires_dask +@pytest.mark.usefixtures("default_zarr_version") def test_pickle_open_mfdataset_dataset(): with open_example_mfdataset(["bears.nc"]) as ds: assert_identical(ds, pickle.loads(pickle.dumps(ds))) @requires_zarr +@pytest.mark.usefixtures("default_zarr_version") def test_zarr_closing_internal_zip_store(): store_name = "tmp.zarr.zip" original_da = DataArray(np.arange(1, 13).reshape((3, 4))) @@ -6096,6 +6098,7 @@ def test_zarr_region_append(self, tmp_path): @requires_zarr +@pytest.mark.usefixtures("default_zarr_version") def test_zarr_region(tmp_path): x = np.arange(0, 50, 10) y = np.arange(0, 20, 2) @@ -6124,6 +6127,7 @@ def test_zarr_region(tmp_path): @requires_zarr @requires_dask +@pytest.mark.usefixtures("default_zarr_version") def test_zarr_region_chunk_partial(tmp_path): """ Check that writing to partial chunks with `region` fails, assuming `safe_chunks=False`. @@ -6144,6 +6148,7 @@ def test_zarr_region_chunk_partial(tmp_path): @requires_zarr @requires_dask +@pytest.mark.usefixtures("default_zarr_version") def test_zarr_append_chunk_partial(tmp_path): t_coords = np.array([np.datetime64("2020-01-01").astype("datetime64[ns]")]) data = np.ones((10, 10)) @@ -6181,11 +6186,14 @@ def test_zarr_append_chunk_partial(tmp_path): @requires_zarr @requires_dask +@pytest.mark.usefixtures("default_zarr_version") def test_zarr_region_chunk_partial_offset(tmp_path): # https://github.com/pydata/xarray/pull/8459#issuecomment-1819417545 store = tmp_path / "foo.zarr" data = np.ones((30,)) - da = xr.DataArray(data, dims=["x"], coords={"x": range(30)}, name="foo").chunk(x=10) + da = xr.DataArray(data, dims=["x"], coords={"x": range(1, 31)}, name="foo").chunk( + x=10 + ) da.to_zarr(store, compute=False) da.isel(x=slice(10)).chunk(x=(10,)).to_zarr(store, region="auto") @@ -6200,10 +6208,13 @@ def test_zarr_region_chunk_partial_offset(tmp_path): @requires_zarr @requires_dask +@pytest.mark.usefixtures("default_zarr_version") def test_zarr_safe_chunk_append_dim(tmp_path): store = tmp_path / "foo.zarr" data = np.ones((20,)) - da = xr.DataArray(data, dims=["x"], coords={"x": range(20)}, name="foo").chunk(x=5) + da = xr.DataArray(data, dims=["x"], coords={"x": range(1, 21)}, name="foo").chunk( + x=5 + ) da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") with pytest.raises(ValueError): @@ -6250,11 +6261,12 @@ def test_zarr_safe_chunk_append_dim(tmp_path): @requires_zarr @requires_dask +@pytest.mark.usefixtures("default_zarr_version") def test_zarr_safe_chunk_region(tmp_path): store = tmp_path / "foo.zarr" arr = xr.DataArray( - list(range(11)), dims=["a"], coords={"a": list(range(11))}, name="foo" + list(range(1, 12)), dims=["a"], coords={"a": list(range(1, 12))}, name="foo" ).chunk(a=3) arr.to_zarr(store, mode="w") From 7c29ea62aa1ef0386dfd95634ef54285ef3d2eb7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 2 Oct 2024 12:03:23 -0500 Subject: [PATCH 20/74] v3 compat for instrumented test --- xarray/tests/test_backends.py | 98 +++++++++++++++++++++++------------ 1 file changed, 65 insertions(+), 33 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index fb521919a90..184e1b1c63a 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3342,16 +3342,24 @@ def test_append(self) -> None: @requires_dask def test_region_write(self) -> None: - ds = Dataset({"foo": ("x", [1, 2, 3])}, coords={"x": [0, 1, 2]}).chunk() + ds = Dataset({"foo": ("x", [1, 2, 3])}, coords={"x": [1, 2, 3]}).chunk() with self.create_zarr_target() as store: - expected = { - "iter": 2, - "contains": 7, - "setitem": 8, - "getitem": 6, - "listdir": 2, - "list_prefix": 4, - } + if have_zarr_v3: + expected = { + "set": 5, + "get": 10, + "list_dir": 3, + "list_prefix": 0, + } + else: + expected = { + "iter": 2, + "contains": 7, + "setitem": 8, + "getitem": 6, + "listdir": 2, + "list_prefix": 4, + } patches = self.make_patches(store) with patch.multiple(KVStore, **patches): ds.to_zarr(store, mode="w", compute=False) @@ -3359,14 +3367,22 @@ def test_region_write(self) -> None: # v2024.03.0: {'iter': 5, 'contains': 2, 'setitem': 1, 'getitem': 6, 'listdir': 5, 'list_prefix': 0} # 6057128b: {'iter': 4, 'contains': 2, 'setitem': 1, 'getitem': 5, 'listdir': 4, 'list_prefix': 0} - expected = { - "iter": 2, - "contains": 2, - "setitem": 1, - "getitem": 3, - "listdir": 2, - "list_prefix": 0, - } + if have_zarr_v3: + expected = { + "set": 1, + "get": 3, + "list_dir": 2, + "list_prefix": 0, + } + else: + expected = { + "iter": 2, + "contains": 2, + "setitem": 1, + "getitem": 3, + "listdir": 2, + "list_prefix": 0, + } patches = self.make_patches(store) with patch.multiple(KVStore, **patches): ds.to_zarr(store, region={"x": slice(None)}) @@ -3374,27 +3390,43 @@ def test_region_write(self) -> None: # v2024.03.0: {'iter': 6, 'contains': 4, 'setitem': 1, 'getitem': 11, 'listdir': 6, 'list_prefix': 0} # 6057128b: {'iter': 4, 'contains': 2, 'setitem': 1, 'getitem': 7, 'listdir': 4, 'list_prefix': 0} - expected = { - "iter": 2, - "contains": 2, - "setitem": 1, - "getitem": 5, - "listdir": 2, - "list_prefix": 0, - } + if have_zarr_v3: + expected = { + "set": 1, + "get": 5, + "list_dir": 2, + "list_prefix": 0, + } + else: + expected = { + "iter": 2, + "contains": 2, + "setitem": 1, + "getitem": 5, + "listdir": 2, + "list_prefix": 0, + } patches = self.make_patches(store) with patch.multiple(KVStore, **patches): ds.to_zarr(store, region="auto") self.check_requests(expected, patches) - expected = { - "iter": 1, - "contains": 2, - "setitem": 0, - "getitem": 5, - "listdir": 1, - "list_prefix": 0, - } + if have_zarr_v3: + expected = { + "set": 0, + "get": 5, + "list_dir": 1, + "list_prefix": 0, + } + else: + expected = { + "iter": 1, + "contains": 2, + "setitem": 0, + "getitem": 5, + "listdir": 1, + "list_prefix": 0, + } patches = self.make_patches(store) with patch.multiple(KVStore, **patches): with open_dataset(store, engine="zarr") as actual: From 9b3c28853c5b0e224bd5eac90c6975e67de6198b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 8 Oct 2024 11:28:47 -0500 Subject: [PATCH 21/74] Handle zarr_version / zarr_format deprecation --- xarray/backends/api.py | 2 ++ xarray/backends/zarr.py | 48 ++++++++++++++++++++++++++++++++--- xarray/core/dataset.py | 16 +++++++++--- xarray/tests/test_backends.py | 15 +++++++++++ 4 files changed, 74 insertions(+), 7 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 0939bb623d6..cb2ffb30690 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -1681,6 +1681,7 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, + zarr_format: int | None = None, write_empty_chunks: bool | None = None, chunkmanager_store_kwargs: dict[str, Any] | None = None, ) -> backends.ZarrStore | Delayed: @@ -1762,6 +1763,7 @@ def to_zarr( safe_chunks=safe_chunks, stacklevel=4, # for Dataset.to_zarr() zarr_version=zarr_version, + zarr_format=zarr_format, write_empty=write_empty_chunks, ) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 8aafb82ad0e..68a4dca259c 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -511,6 +511,7 @@ def open_store( safe_chunks=True, stacklevel=2, zarr_version=None, + zarr_format=None, write_empty: bool | None = None, ): @@ -525,6 +526,7 @@ def open_store( storage_options=storage_options, stacklevel=stacklevel, zarr_version=zarr_version, + zarr_format=zarr_format, ) group_paths = [node for node in _iter_zarr_groups(zarr_group, parent=group)] return { @@ -557,6 +559,7 @@ def open_group( safe_chunks=True, stacklevel=2, zarr_version=None, + zarr_format=None, write_empty: bool | None = None, ): @@ -571,6 +574,7 @@ def open_group( storage_options=storage_options, stacklevel=stacklevel, zarr_version=zarr_version, + zarr_format=zarr_format, ) return cls( @@ -1082,6 +1086,7 @@ def open_zarr( decode_timedelta=None, use_cftime=None, zarr_version=None, + zarr_format=None, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, **kwargs, @@ -1171,9 +1176,15 @@ def open_zarr( decode times to ``np.datetime64[ns]`` objects; if this is not possible raise an error. zarr_version : int or None, optional - The desired zarr spec version to target (currently 2 or 3). The default + + .. deprecated:: 2024.9.1 + Use ``zarr_format`` instead. + + zarr_format : int or None, optional + The desired zarr format to target (currently 2 or 3). The default of None will attempt to determine the zarr version from ``store`` when - possible, otherwise defaulting to 2. + possible, otherwise defaulting to the default version used by + the zarr-python library installed. chunked_array_type: str, optional Which chunked array type to coerce this datasets' arrays to. Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEntryPoint` system. @@ -1226,6 +1237,7 @@ def open_zarr( "storage_options": storage_options, "stacklevel": 4, "zarr_version": zarr_version, + "zarr_format": zarr_format, } ds = open_dataset( @@ -1293,6 +1305,7 @@ def open_dataset( # type: ignore[override] # allow LSP violation, not supporti storage_options=None, stacklevel=3, zarr_version=None, + zarr_format=None, store=None, engine=None, ) -> Dataset: @@ -1309,6 +1322,7 @@ def open_dataset( # type: ignore[override] # allow LSP violation, not supporti storage_options=storage_options, stacklevel=stacklevel + 1, zarr_version=zarr_version, + zarr_format=zarr_format, ) store_entrypoint = StoreBackendEntrypoint() @@ -1344,6 +1358,7 @@ def open_datatree( storage_options=None, stacklevel=3, zarr_version=None, + # TODO: zarr_format **kwargs, ) -> DataTree: from xarray.core.datatree import DataTree @@ -1459,6 +1474,7 @@ def _get_open_params( storage_options, stacklevel, zarr_version, + zarr_format, ): import zarr @@ -1474,10 +1490,14 @@ def _get_open_params( ) open_kwargs["storage_options"] = storage_options + zarr_format = _handle_zarr_version_or_format( + zarr_version=zarr_version, zarr_format=zarr_format + ) + if _zarr_v3(): - open_kwargs["zarr_format"] = zarr_version + open_kwargs["zarr_format"] = zarr_format else: - open_kwargs["zarr_version"] = zarr_version + open_kwargs["zarr_version"] = zarr_format if chunk_store is not None: open_kwargs["chunk_store"] = chunk_store @@ -1524,4 +1544,24 @@ def _get_open_params( return zarr_group, consolidate_on_close, close_store_on_close +def _handle_zarr_version_or_format( + *, zarr_version: ZarrFormat | None, zarr_format: ZarrFormat | None +) -> ZarrFormat | None: + """handle the deprecated zarr_version kwarg and return zarr_format""" + if ( + zarr_format is not None + and zarr_version is not None + and zarr_format != zarr_version + ): + raise ValueError( + f"zarr_format {zarr_format} does not match zarr_version {zarr_version}, please only set one" + ) + if zarr_version is not None: + warnings.warn( + "zarr_version is deprecated, use zarr_format", FutureWarning, stacklevel=6 + ) + return zarr_version + return zarr_format + + BACKEND_ENTRYPOINTS["zarr"] = ("zarr", ZarrBackendEntrypoint) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index d57a6957553..8a7a65ee57d 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2375,6 +2375,7 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, + zarr_format: int | None = None, write_empty_chunks: bool | None = None, chunkmanager_store_kwargs: dict[str, Any] | None = None, ) -> ZarrStore: ... @@ -2397,6 +2398,7 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, + zarr_format: int | None = None, write_empty_chunks: bool | None = None, chunkmanager_store_kwargs: dict[str, Any] | None = None, ) -> Delayed: ... @@ -2417,6 +2419,7 @@ def to_zarr( safe_chunks: bool = True, storage_options: dict[str, str] | None = None, zarr_version: int | None = None, + zarr_format: int | None = None, write_empty_chunks: bool | None = None, chunkmanager_store_kwargs: dict[str, Any] | None = None, ) -> ZarrStore | Delayed: @@ -2527,9 +2530,15 @@ def to_zarr( Any additional parameters for the storage backend (ignored for local paths). zarr_version : int or None, optional - The desired zarr spec version to target (currently 2 or 3). The - default of None will attempt to determine the zarr version from - ``store`` when possible, otherwise defaulting to 2. + + .. deprecated:: 2024.9.1 + Use ``zarr_format`` instead. + + zarr_format : int or None, optional + The desired zarr format to target (currently 2 or 3). The default + of None will attempt to determine the zarr version from ``store`` when + possible, otherwise defaulting to the default version used by + the zarr-python library installed. write_empty_chunks : bool or None, optional If True, all chunks will be stored regardless of their contents. If False, each chunk is compared to the array's fill value @@ -2590,6 +2599,7 @@ def to_zarr( region=region, safe_chunks=safe_chunks, zarr_version=zarr_version, + zarr_format=zarr_format, write_empty_chunks=write_empty_chunks, chunkmanager_store_kwargs=chunkmanager_store_kwargs, ) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index a24f7d0891f..8bc079c005b 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3630,6 +3630,21 @@ def test_zarr_storage_options() -> None: assert_identical(ds, ds_a) +@requires_zarr +def test_zarr_version_deprecated() -> None: + ds = create_test_data() + store = KVStore({}) + + with pytest.warns(FutureWarning, match="zarr_version"): + ds.to_zarr(store, zarr_version=2) + + with pytest.warns(FutureWarning, match="zarr_version"): + xr.open_zarr(store, zarr_version=2) + + with pytest.raises(ValueError, match="zarr_format"): + xr.open_zarr(store, zarr_version=2, zarr_format=3) + + @requires_scipy class TestScipyInMemoryData(CFEncodedBase, NetCDF3Only): engine: T_NetcdfEngine = "scipy" From 3717391088ce26c1277a95216cf6364cc1d65908 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Tue, 8 Oct 2024 20:17:11 -0400 Subject: [PATCH 22/74] wip --- xarray/backends/zarr.py | 53 +++++++++++++++++++++++++++-------- xarray/coding/variables.py | 1 + xarray/core/dtypes.py | 9 +++++- xarray/tests/test_backends.py | 3 +- 4 files changed, 53 insertions(+), 13 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 2f7fe8702b1..5aebb754409 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -495,6 +495,7 @@ class ZarrStore(AbstractWritableDataStore): "_safe_chunks", "_write_empty", "_close_store_on_close", + "_use_zarr_fill_value_as_mask" ) @classmethod @@ -513,10 +514,11 @@ def open_store( safe_chunks=True, stacklevel=2, zarr_version=None, + use_zarr_fill_value_as_mask=None, write_empty: bool | None = None, ): - zarr_group, consolidate_on_close, close_store_on_close = _get_open_params( + zarr_group, consolidate_on_close, close_store_on_close, use_zarr_fill_value_as_mask = _get_open_params( store=store, mode=mode, synchronizer=synchronizer, @@ -527,6 +529,7 @@ def open_store( storage_options=storage_options, stacklevel=stacklevel, zarr_version=zarr_version, + use_zarr_fill_value_as_mask=use_zarr_fill_value_as_mask ) group_paths = [node for node in _iter_zarr_groups(zarr_group, parent=group)] return { @@ -539,6 +542,7 @@ def open_store( safe_chunks, write_empty, close_store_on_close, + use_zarr_fill_value_as_mask ) for group in group_paths } @@ -559,10 +563,11 @@ def open_group( safe_chunks=True, stacklevel=2, zarr_version=None, + use_zarr_fill_value_as_mask=None, write_empty: bool | None = None, ): - zarr_group, consolidate_on_close, close_store_on_close = _get_open_params( + zarr_group, consolidate_on_close, close_store_on_close, use_zarr_fill_value_as_mask = _get_open_params( store=store, mode=mode, synchronizer=synchronizer, @@ -573,6 +578,7 @@ def open_group( storage_options=storage_options, stacklevel=stacklevel, zarr_version=zarr_version, + use_zarr_fill_value_as_mask=use_zarr_fill_value_as_mask ) return cls( @@ -584,6 +590,7 @@ def open_group( safe_chunks, write_empty, close_store_on_close, + use_zarr_fill_value_as_mask ) def __init__( @@ -596,6 +603,7 @@ def __init__( safe_chunks=True, write_empty: bool | None = None, close_store_on_close: bool = False, + use_zarr_fill_value_as_mask=None ): self.zarr_group = zarr_group self._read_only = self.zarr_group.read_only @@ -607,7 +615,8 @@ def __init__( self._write_region = write_region self._safe_chunks = safe_chunks self._write_empty = write_empty - self._close_store_on_close = close_store_on_close + self._close_store_on_close = close_store_on_close, + self._use_zarr_fill_value_as_mask = use_zarr_fill_value_as_mask @property def ds(self): @@ -650,10 +659,11 @@ def open_store_variable(self, name, zarr_array=None): } ) - # _FillValue needs to be in attributes, not encoding, so it will get - # picked up by decode_cf - if zarr_array.fill_value is not None: - attributes["_FillValue"] = zarr_array.fill_value + if self._use_zarr_fill_value_as_mask: + # Setting this attribute triggers CF decoding for missing values + # TODO: it feels a bit hacky to hijack CF decoding for this purpose + if zarr_array.fill_value is not None: + attributes["_FillValue"] = zarr_array.fill_value return Variable(dimensions, data, attributes, encoding) @@ -857,9 +867,12 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No dtype = v.dtype shape = v.shape - fill_value = attrs.pop("_FillValue", None) - if v.encoding == {"_FillValue": None} and fill_value is None: - v.encoding = {} + if self._use_zarr_fill_value_as_mask: + fill_value = attrs.pop("_FillValue", None) + if v.encoding == {"_FillValue": None} and fill_value is None: + v.encoding = {} + else: + fill_value=None zarr_array = None zarr_shape = None @@ -967,6 +980,7 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No pipeline = encoding.pop("codecs") encoding["codecs"] = pipeline + print(dtype, encoding) zarr_array = self.zarr_group.create( name, shape=shape, @@ -1084,6 +1098,7 @@ def open_zarr( decode_timedelta=None, use_cftime=None, zarr_version=None, + use_zarr_fill_value_as_mask=None, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, **kwargs, @@ -1176,6 +1191,11 @@ def open_zarr( The desired zarr spec version to target (currently 2 or 3). The default of None will attempt to determine the zarr version from ``store`` when possible, otherwise defaulting to 2. + use_zarr_fill_value_as_mask : bool, optional + If True, use the zarr Array `fill_value` to mask the data, the same as done + for NetCDF data with `_FillValue` or `missing_value` attributes. If False, + the `fill_value` is ignored and the data are not masked. If None, this defaults + to True for `zarr_version=2` and False for `zarr_version=3`. chunked_array_type: str, optional Which chunked array type to coerce this datasets' arrays to. Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEntryPoint` system. @@ -1247,6 +1267,7 @@ def open_zarr( decode_timedelta=decode_timedelta, use_cftime=use_cftime, zarr_version=zarr_version, + use_zarr_fill_value_as_mask=use_zarr_fill_value_as_mask, ) return ds @@ -1297,6 +1318,7 @@ def open_dataset( # type: ignore[override] # allow LSP violation, not supporti zarr_version=None, store=None, engine=None, + use_zarr_fill_value_as_mask=None ) -> Dataset: filename_or_obj = _normalize_path(filename_or_obj) if not store: @@ -1311,6 +1333,7 @@ def open_dataset( # type: ignore[override] # allow LSP violation, not supporti storage_options=storage_options, stacklevel=stacklevel + 1, zarr_version=zarr_version, + use_zarr_fill_value_as_mask=None ) store_entrypoint = StoreBackendEntrypoint() @@ -1443,6 +1466,7 @@ def _get_open_params( storage_options, stacklevel, zarr_version, + use_zarr_fill_value_as_mask ): import zarr @@ -1505,7 +1529,14 @@ def _get_open_params( else: zarr_group = zarr.open_group(store, **open_kwargs) close_store_on_close = zarr_group.store is not store - return zarr_group, consolidate_on_close, close_store_on_close + if use_zarr_fill_value_as_mask is None: + if zarr_version == 3: + # for new data, we use a better default + use_zarr_fill_value_as_mask = False + else: + # this was the default for v2 and shold apply to most existing Zarr data + use_zarr_fill_value_as_mask = True + return zarr_group, consolidate_on_close, close_store_on_close, use_zarr_fill_value_as_mask BACKEND_ENTRYPOINTS["zarr"] = ("zarr", ZarrBackendEntrypoint) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 74916886026..4c235b3e42b 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -464,6 +464,7 @@ def decode(self, variable: Variable, name: T_Name = None): dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min else: if "scale_factor" not in attrs and "add_offset" not in attrs: + print(data.dtype) dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype) else: dtype, decoded_fill_value = ( diff --git a/xarray/core/dtypes.py b/xarray/core/dtypes.py index 7464c1e8a89..0d228e17a19 100644 --- a/xarray/core/dtypes.py +++ b/xarray/core/dtypes.py @@ -61,7 +61,13 @@ def maybe_promote(dtype: np.dtype) -> tuple[np.dtype, Any]: # N.B. these casting rules should match pandas dtype_: np.typing.DTypeLike fill_value: Any - if isdtype(dtype, "real floating"): + print("maybe_promote", dtype) + if np.issubdtype(dtype, np.dtypes.StringDType()): + # for now, we always promote string dtypes to object for consistency with existing behavior + # TODO: refactor this once we have a better way to handle numpy vlen-string dtypes + dtype_ = object + fill_value = np.nan + elif isdtype(dtype, "real floating"): dtype_ = dtype fill_value = np.nan elif np.issubdtype(dtype, np.timedelta64): @@ -196,6 +202,7 @@ def isdtype(dtype, kind: str | tuple[str, ...], xp=None) -> bool: Unlike xp.isdtype(), kind must be a string. """ + print(dtype, kind, xp) # TODO(shoyer): remove this wrapper when Xarray requires # numpy>=2 and pandas extensions arrays are implemented in # Xarray via the array API diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 184e1b1c63a..3cdd5baffd2 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2242,7 +2242,7 @@ def create_store(self): def save(self, dataset, store_target, **kwargs): # type: ignore[override] if have_zarr_v3 and zarr.config.config["default_zarr_version"] == 3: for k, v in dataset.variables.items(): - if v.dtype.kind in ("U", "O", "M", "b"): + if v.dtype.kind in ("M",): pytest.skip(reason=f"Unsupported dtype {v} for variable: {k}") return dataset.to_zarr(store=store_target, **kwargs, **self.version_kwargs) @@ -2265,6 +2265,7 @@ def roundtrip( with self.create_zarr_target() as store_target: self.save(data, store_target, **save_kwargs) with self.open(store_target, **open_kwargs) as ds: + assert False yield ds @pytest.mark.parametrize("consolidated", [False, True, None]) From 8d16bb291da4a10912030cf556b6255de0254e1e Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Tue, 8 Oct 2024 23:16:59 -0400 Subject: [PATCH 23/74] most Zarr tests passing --- xarray/backends/zarr.py | 54 ++++++++++++++++++++++++----------- xarray/coding/variables.py | 14 ++++++++- xarray/conventions.py | 3 ++ xarray/core/dtypes.py | 2 -- xarray/tests/test_backends.py | 8 +++--- 5 files changed, 58 insertions(+), 23 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 5aebb754409..58572f4998c 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -414,6 +414,7 @@ def _validate_datatypes_for_zarr_append(vname, existing_var, new_var): or np.issubdtype(new_var.dtype, np.datetime64) or np.issubdtype(new_var.dtype, np.bool_) or new_var.dtype == object + or (new_var.dtype.kind in ("S", "U") and existing_var.dtype == object) ): # We can skip dtype equality checks under two conditions: (1) if the var to append is # new to the dataset, because in this case there is no existing var to compare it to; @@ -495,7 +496,7 @@ class ZarrStore(AbstractWritableDataStore): "_safe_chunks", "_write_empty", "_close_store_on_close", - "_use_zarr_fill_value_as_mask" + "_use_zarr_fill_value_as_mask", ) @classmethod @@ -518,7 +519,12 @@ def open_store( write_empty: bool | None = None, ): - zarr_group, consolidate_on_close, close_store_on_close, use_zarr_fill_value_as_mask = _get_open_params( + ( + zarr_group, + consolidate_on_close, + close_store_on_close, + use_zarr_fill_value_as_mask, + ) = _get_open_params( store=store, mode=mode, synchronizer=synchronizer, @@ -529,7 +535,7 @@ def open_store( storage_options=storage_options, stacklevel=stacklevel, zarr_version=zarr_version, - use_zarr_fill_value_as_mask=use_zarr_fill_value_as_mask + use_zarr_fill_value_as_mask=use_zarr_fill_value_as_mask, ) group_paths = [node for node in _iter_zarr_groups(zarr_group, parent=group)] return { @@ -542,7 +548,7 @@ def open_store( safe_chunks, write_empty, close_store_on_close, - use_zarr_fill_value_as_mask + use_zarr_fill_value_as_mask, ) for group in group_paths } @@ -567,7 +573,12 @@ def open_group( write_empty: bool | None = None, ): - zarr_group, consolidate_on_close, close_store_on_close, use_zarr_fill_value_as_mask = _get_open_params( + ( + zarr_group, + consolidate_on_close, + close_store_on_close, + use_zarr_fill_value_as_mask, + ) = _get_open_params( store=store, mode=mode, synchronizer=synchronizer, @@ -578,7 +589,7 @@ def open_group( storage_options=storage_options, stacklevel=stacklevel, zarr_version=zarr_version, - use_zarr_fill_value_as_mask=use_zarr_fill_value_as_mask + use_zarr_fill_value_as_mask=use_zarr_fill_value_as_mask, ) return cls( @@ -590,7 +601,7 @@ def open_group( safe_chunks, write_empty, close_store_on_close, - use_zarr_fill_value_as_mask + use_zarr_fill_value_as_mask, ) def __init__( @@ -603,7 +614,7 @@ def __init__( safe_chunks=True, write_empty: bool | None = None, close_store_on_close: bool = False, - use_zarr_fill_value_as_mask=None + use_zarr_fill_value_as_mask=None, ): self.zarr_group = zarr_group self._read_only = self.zarr_group.read_only @@ -615,7 +626,7 @@ def __init__( self._write_region = write_region self._safe_chunks = safe_chunks self._write_empty = write_empty - self._close_store_on_close = close_store_on_close, + self._close_store_on_close = (close_store_on_close,) self._use_zarr_fill_value_as_mask = use_zarr_fill_value_as_mask @property @@ -872,7 +883,7 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No if v.encoding == {"_FillValue": None} and fill_value is None: v.encoding = {} else: - fill_value=None + fill_value = None zarr_array = None zarr_shape = None @@ -980,7 +991,6 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No pipeline = encoding.pop("codecs") encoding["codecs"] = pipeline - print(dtype, encoding) zarr_array = self.zarr_group.create( name, shape=shape, @@ -1318,7 +1328,7 @@ def open_dataset( # type: ignore[override] # allow LSP violation, not supporti zarr_version=None, store=None, engine=None, - use_zarr_fill_value_as_mask=None + use_zarr_fill_value_as_mask=None, ) -> Dataset: filename_or_obj = _normalize_path(filename_or_obj) if not store: @@ -1333,7 +1343,7 @@ def open_dataset( # type: ignore[override] # allow LSP violation, not supporti storage_options=storage_options, stacklevel=stacklevel + 1, zarr_version=zarr_version, - use_zarr_fill_value_as_mask=None + use_zarr_fill_value_as_mask=None, ) store_entrypoint = StoreBackendEntrypoint() @@ -1466,7 +1476,7 @@ def _get_open_params( storage_options, stacklevel, zarr_version, - use_zarr_fill_value_as_mask + use_zarr_fill_value_as_mask, ): import zarr @@ -1529,14 +1539,26 @@ def _get_open_params( else: zarr_group = zarr.open_group(store, **open_kwargs) close_store_on_close = zarr_group.store is not store + + # we use this to determine how to handle fill_value + is_zarr_v3_format: bool + if _zarr_v3(): + is_zarr_v3_format = zarr_group.metadata.zarr_format == 3 + else: + is_zarr_v3_format = False if use_zarr_fill_value_as_mask is None: - if zarr_version == 3: + if is_zarr_v3_format: # for new data, we use a better default use_zarr_fill_value_as_mask = False else: # this was the default for v2 and shold apply to most existing Zarr data use_zarr_fill_value_as_mask = True - return zarr_group, consolidate_on_close, close_store_on_close, use_zarr_fill_value_as_mask + return ( + zarr_group, + consolidate_on_close, + close_store_on_close, + use_zarr_fill_value_as_mask, + ) BACKEND_ENTRYPOINTS["zarr"] = ("zarr", ZarrBackendEntrypoint) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 4c235b3e42b..3fbc175d3e3 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -464,7 +464,6 @@ def decode(self, variable: Variable, name: T_Name = None): dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min else: if "scale_factor" not in attrs and "add_offset" not in attrs: - print(data.dtype) dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype) else: dtype, decoded_fill_value = ( @@ -708,6 +707,19 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: return variable +class Numpy2StringDTypeCoder(VariableCoder): + # Convert Numpy 2 StringDType arrays to object arrays for backwards compatibility + # TODO: remove this if / when we decide to allow StringDType arrays in Xarray + def encode(self): + raise NotImplementedError + + def decode(self, variable: Variable, name: T_Name = None) -> Variable: + if variable.dtype.kind == "T": + return variable.astype(object) + else: + return variable + + class NativeEnumCoder(VariableCoder): """Encode Enum into variable dtype metadata.""" diff --git a/xarray/conventions.py b/xarray/conventions.py index 18a81938225..9f719b8497c 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -276,6 +276,9 @@ def decode_cf_variable( var = variables.ObjectVLenStringCoder().decode(var) original_dtype = var.dtype + if original_dtype.kind == "T": + var = variables.Numpy2StringDTypeCoder().decode(var) + if mask_and_scale: for coder in [ variables.CFMaskCoder(), diff --git a/xarray/core/dtypes.py b/xarray/core/dtypes.py index 0d228e17a19..985eb3fa83f 100644 --- a/xarray/core/dtypes.py +++ b/xarray/core/dtypes.py @@ -61,7 +61,6 @@ def maybe_promote(dtype: np.dtype) -> tuple[np.dtype, Any]: # N.B. these casting rules should match pandas dtype_: np.typing.DTypeLike fill_value: Any - print("maybe_promote", dtype) if np.issubdtype(dtype, np.dtypes.StringDType()): # for now, we always promote string dtypes to object for consistency with existing behavior # TODO: refactor this once we have a better way to handle numpy vlen-string dtypes @@ -202,7 +201,6 @@ def isdtype(dtype, kind: str | tuple[str, ...], xp=None) -> bool: Unlike xp.isdtype(), kind must be a string. """ - print(dtype, kind, xp) # TODO(shoyer): remove this wrapper when Xarray requires # numpy>=2 and pandas extensions arrays are implemented in # Xarray via the array API diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 3cdd5baffd2..7f3e00d872f 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -558,6 +558,7 @@ def test_roundtrip_object_dtype(self) -> None: # This currently includes all netCDF files when encoding is not # explicitly set. # https://github.com/pydata/xarray/issues/1647 + # Also Zarr expected["bytes_nans"][-1] = b"" expected["strings_nans"][-1] = "" assert_identical(expected, actual) @@ -2265,7 +2266,6 @@ def roundtrip( with self.create_zarr_target() as store_target: self.save(data, store_target, **save_kwargs) with self.open(store_target, **open_kwargs) as ds: - assert False yield ds @pytest.mark.parametrize("consolidated", [False, True, None]) @@ -2745,12 +2745,12 @@ def test_check_encoding_is_consistent_after_append(self) -> None: with self.create_zarr_target() as store_target: import numcodecs - compressor = numcodecs.Blosc() - if have_zarr_v3 and zarr.config.config["default_zarr_version"] == 3: + compressor = zarr.codecs.BloscCodec() encoding_key = "codecs" - encoding_value = [compressor] + encoding_value = [zarr.codecs.BytesCodec(), compressor] else: + compressor = numcodecs.Blosc() encoding_key = "compressor" encoding_value = compressor From bd978b04827b858a106a18b19e311caf1cf71abb Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Tue, 8 Oct 2024 23:29:18 -0400 Subject: [PATCH 24/74] unskip tests --- xarray/tests/test_backends.py | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 7f3e00d872f..990fc840b23 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3164,38 +3164,6 @@ def test_chunked_cftime_datetime(self) -> None: assert original[name].chunks == actual_var.chunks assert original.chunks == actual.chunks - def test_write_store(self) -> None: - skip_if_zarr_format_3(reason="unsupported dtypes") - return super().test_write_store() - - def test_roundtrip_endian(self) -> None: - skip_if_zarr_format_3(reason="unsupported dtypes") - return super().test_roundtrip_endian() - - def test_roundtrip_bytes_with_fill_value(self) -> None: - skip_if_zarr_format_3(reason="unsupported dtypes") - return super().test_roundtrip_bytes_with_fill_value() - - def test_default_fill_value(self) -> None: - skip_if_zarr_format_3(reason="fill_value always written") - return super().test_default_fill_value() - - def test_explicitly_omit_fill_value(self) -> None: - skip_if_zarr_format_3(reason="fill_value always written") - return super().test_explicitly_omit_fill_value() - - def test_explicitly_omit_fill_value_via_encoding_kwarg(self) -> None: - skip_if_zarr_format_3(reason="fill_value always written") - return super().test_explicitly_omit_fill_value_via_encoding_kwarg() - - def test_explicitly_omit_fill_value_in_coord(self) -> None: - skip_if_zarr_format_3(reason="fill_value always written") - return super().test_explicitly_omit_fill_value_in_coord() - - def test_explicitly_omit_fill_value_in_coord_via_encoding_kwarg(self) -> None: - skip_if_zarr_format_3(reason="fill_value always written") - return super().test_explicitly_omit_fill_value_in_coord_via_encoding_kwarg() - @requires_zarr @pytest.mark.skipif(not have_zarr_v3, reason="requires zarr version 3") From 34c4c2430f9f0e3559b47f52d6077c536cb2386a Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Wed, 9 Oct 2024 10:54:46 -0400 Subject: [PATCH 25/74] add custom Zarr _FillValue encoding / decoding --- xarray/backends/zarr.py | 59 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 58572f4998c..f3d122ad394 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1,8 +1,10 @@ from __future__ import annotations +import base64 import functools import json import os +import struct import warnings from collections.abc import Callable, Iterable from typing import TYPE_CHECKING, Any, Literal @@ -58,6 +60,49 @@ def _zarr_v3() -> bool: ZarrFormat = Literal[2, 3] +class FillValueCoder: + """Handle custom logic to safely encode and decode fill values in Zarr. + Possibly redundant with logic in xarray/coding/variables.py but needs to be + isolated from NetCDF-specific logic. + """ + + @classmethod + def encode(cls, value: int | float | str | bytes, dtype: np.dtype[Any]) -> Any: + if dtype.kind in "S": + # byte string + return base64.standard_b64encode(value).decode() + elif dtype.kind in "b": + # boolean + return bool(value) + elif dtype.kind in "iu": + # todo: do we want to check for decimals? + return int(value) + elif dtype.kind in "f": + return base64.standard_b64encode(struct.pack(" Date: Wed, 9 Oct 2024 15:03:23 -0400 Subject: [PATCH 26/74] relax dtype comparison in test_roundtrip_empty_vlen_string_array --- xarray/tests/test_backends.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 990fc840b23..306d8e27d6a 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -924,7 +924,8 @@ def test_roundtrip_empty_vlen_string_array(self) -> None: if actual["a"].dtype.metadata is not None: assert check_vlen_dtype(actual["a"].dtype) is str else: - assert actual["a"].dtype == np.dtype("=U1") + # zarr v3 sends back " Date: Wed, 9 Oct 2024 19:28:00 -0400 Subject: [PATCH 27/74] fix test_explicitly_omit_fill_value_via_encoding_kwarg --- xarray/backends/zarr.py | 18 +++++++++++++----- xarray/tests/test_backends.py | 2 ++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 3917d460e17..9f9238f9cba 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -445,6 +445,7 @@ def encode_zarr_variable(var, needs_copy=True, name=None): # zarr allows unicode, but not variable-length strings, so it's both # simpler and more compact to always encode as UTF-8 explicitly. # TODO: allow toggling this explicitly via dtype in encoding. + # TODO: revisit this now that Zarr _does_ allow variable-length strings coder = coding.strings.EncodedStringCoder(allows_unicode=True) var = coder.encode(var, name=name) var = coding.strings.ensure_fixed_length_bytes(var) @@ -936,15 +937,22 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No if self._use_zarr_fill_value_as_mask: fill_value = attrs.pop("_FillValue", None) - if v.encoding == {"_FillValue": None} and fill_value is None: - v.encoding = {} else: fill_value = None if "_FillValue" in attrs: # replace with encoded fill value - attrs["_FillValue"] = FillValueCoder.encode( - attrs["_FillValue"], dtype - ) + fv = attrs.pop("_FillValue") + if fv is not None: + attrs["_FillValue"] = FillValueCoder.encode(fv, dtype) + + # _FillValue is never a valid encoding for Zarr + # TODO: refactor this logic so we don't need to check this here + if "_FillValue" in v.encoding: + efv = v.encoding["_FillValue"] + if efv is not None: + raise ValueError("Zarr does not support _FillValue in encoding.") + else: + del v.encoding["_FillValue"] zarr_array = None zarr_shape = None diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 17d9ac7d6d0..8d5cf99c106 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1333,6 +1333,8 @@ def test_explicitly_omit_fill_value(self) -> None: with self.roundtrip(ds) as actual: assert "_FillValue" not in actual.x.encoding + # TODO: decide if this test is really necessary + # _FillValue is not a valid encoding for Zarr def test_explicitly_omit_fill_value_via_encoding_kwarg(self) -> None: ds = Dataset({"x": ("y", [np.pi, -np.pi])}) kwargs = dict(encoding={"x": {"_FillValue": None}}) From a330e4b9b1a8ffdca4827b870932ecfce3ed976d Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Wed, 9 Oct 2024 19:51:11 -0400 Subject: [PATCH 28/74] fix test_append_string_length_mismatch_raises --- xarray/tests/test_backends.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 17d9ac7d6d0..7864f8d1e81 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -148,6 +148,11 @@ def skip_if_zarr_format_3(reason: str): pytest.xfail(reason=f"Unsupported with zarr_format=3: {reason}") +def skip_if_zarr_format_2(reason: str): + if have_zarr_v3 and zarr.config["default_zarr_version"] == 2: + pytest.xfail(reason=f"Unsupported with zarr_format=2: {reason}") + + ON_WINDOWS = sys.platform == "win32" default_value = object() dask_array_type = array_type("dask") @@ -2761,6 +2766,7 @@ def test_append_with_existing_encoding_raises(self) -> None: @pytest.mark.parametrize("dtype", ["U", "S"]) def test_append_string_length_mismatch_raises(self, dtype) -> None: + skip_if_zarr_format_3("This actually works fine with Zarr format 3") ds, ds_to_append = create_append_string_length_mismatch_test_data(dtype) with self.create_zarr_target() as store_target: ds.to_zarr(store_target, mode="w", **self.version_kwargs) @@ -2769,6 +2775,18 @@ def test_append_string_length_mismatch_raises(self, dtype) -> None: store_target, append_dim="time", **self.version_kwargs ) + @pytest.mark.parametrize("dtype", ["U", "S"]) + def test_append_string_length_mismatch_works(self, dtype) -> None: + skip_if_zarr_format_2("This doesn't work with Zarr format 2") + # ...but it probably would if we used object dtype + ds, ds_to_append = create_append_string_length_mismatch_test_data(dtype) + expected = xr.concat([ds, ds_to_append], dim="time") + with self.create_zarr_target() as store_target: + ds.to_zarr(store_target, mode="w", **self.version_kwargs) + ds_to_append.to_zarr(store_target, append_dim="time", **self.version_kwargs) + actual = xr.open_dataset(store_target, engine="zarr") + xr.testing.assert_identical(expected, actual) + def test_check_encoding_is_consistent_after_append(self) -> None: ds, ds_to_append, _ = create_append_test_data() From bde42ee3164fe355b8483d1ead72f033d1318315 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Wed, 9 Oct 2024 20:21:52 -0400 Subject: [PATCH 29/74] fix test_check_encoding_is_consistent_after_append for v3 --- xarray/tests/test_backends.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 17d9ac7d6d0..30c647d4d8a 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2787,12 +2787,18 @@ def test_check_encoding_is_consistent_after_append(self) -> None: encoding = {"da": {encoding_key: encoding_value}} ds.to_zarr(store_target, mode="w", encoding=encoding, **self.version_kwargs) + original_ds = xr.open_dataset( + store_target, engine="zarr", **self.version_kwargs + ) + original_encoding = original_ds["da"].encoding[encoding_key] ds_to_append.to_zarr(store_target, append_dim="time", **self.version_kwargs) actual_ds = xr.open_dataset( store_target, engine="zarr", **self.version_kwargs ) + # assert actual_encoding.get_config() == compressor.get_config() + # TODO: check whether this approach works for v2 actual_encoding = actual_ds["da"].encoding[encoding_key] - assert actual_encoding.get_config() == compressor.get_config() + assert original_encoding == actual_encoding assert_identical( xr.open_dataset( store_target, engine="zarr", **self.version_kwargs From b15705d166f513023855b59f2e7ceed9505398ca Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Wed, 9 Oct 2024 20:37:55 -0400 Subject: [PATCH 30/74] skip roundtrip_endian for zarr v3 --- xarray/tests/test_backends.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 17d9ac7d6d0..9740d8cc2d0 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1224,6 +1224,7 @@ def equals_latlon(obj): assert "coordinates" not in ds["lon"].encoding def test_roundtrip_endian(self) -> None: + skip_if_zarr_format_3("zarr v3 has not implemented endian support yet") ds = Dataset( { "x": np.arange(3, 10, dtype=">i2"), From 38f43b9156005f963b8d00d436fe8b909c8fee85 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Wed, 9 Oct 2024 21:45:07 -0400 Subject: [PATCH 31/74] unskip datetimes and fix test_compressor_encoding --- xarray/tests/test_backends.py | 38 +++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 17d9ac7d6d0..50f2c6f7330 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2272,11 +2272,6 @@ def create_store(self): ) def save(self, dataset, store_target, **kwargs): # type: ignore[override] - if have_zarr_v3 and zarr.config.config["default_zarr_version"] == 3: - for k, v in dataset.variables.items(): - if v.dtype.kind in ("M",): - pytest.skip(reason=f"Unsupported dtype {v} for variable: {k}") - return dataset.to_zarr(store=store_target, **kwargs, **self.version_kwargs) @contextlib.contextmanager @@ -2638,21 +2633,38 @@ def test_write_persistence_modes(self, group) -> None: def test_compressor_encoding(self) -> None: original = create_test_data() # specify a custom compressor - from numcodecs.blosc import Blosc - - blosc_comp = Blosc(cname="zstd", clevel=3, shuffle=2) if have_zarr_v3 and zarr.config.config["default_zarr_version"] == 3: - encoding = {"codecs": [blosc_comp]} + encoding_key = "codecs" + # all parameters need to be explicitly specified in order for the comparison to pass below + encoding = { + encoding_key: ( + zarr.codecs.BytesCodec(endian="little"), + zarr.codecs.BloscCodec( + cname="zstd", + clevel=3, + shuffle="shuffle", + typesize=8, + blocksize=0, + ), + ) + } else: - encoding = {"compressor": blosc_comp} + from numcodecs.blosc import Blosc + + encoding_key = "compressor" + encoding = {encoding_key: Blosc(cname="zstd", clevel=3, shuffle=2)} save_kwargs = dict(encoding={"var1": encoding}) with self.roundtrip(original, save_kwargs=save_kwargs) as ds: - actual = ds["var1"].encoding["compressor"] - # get_config returns a dictionary of compressor attributes - assert actual.get_config() == blosc_comp.get_config() + enc = ds["var1"].encoding[encoding_key] + if have_zarr_v3 and zarr.config.config["default_zarr_version"] == 3: + # TODO: figure out a cleaner way to do this comparison + codecs = zarr.core.metadata.v3.parse_codecs(enc) + assert codecs == encoding[encoding_key] + else: + assert enc == encoding[encoding_key] def test_group(self) -> None: original = create_test_data() From 1cfc458dec5cf57b82594fc27b7925b14da1319d Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Wed, 9 Oct 2024 21:19:04 -0400 Subject: [PATCH 32/74] unskip tests --- xarray/tests/test_backends.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 17d9ac7d6d0..be607685f4c 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2272,11 +2272,6 @@ def create_store(self): ) def save(self, dataset, store_target, **kwargs): # type: ignore[override] - if have_zarr_v3 and zarr.config.config["default_zarr_version"] == 3: - for k, v in dataset.variables.items(): - if v.dtype.kind in ("M",): - pytest.skip(reason=f"Unsupported dtype {v} for variable: {k}") - return dataset.to_zarr(store=store_target, **kwargs, **self.version_kwargs) @contextlib.contextmanager @@ -5353,13 +5348,13 @@ def test_dataarray_to_netcdf_no_name_pathlib(self) -> None: class TestDataArrayToZarr: def skip_if_zarr_python_3_and_zip_store(self, store) -> None: - if isinstance(store, zarr.storage.ZipStore): + if isinstance(store, zarr.storage.zip.ZipStore): pytest.skip( reason="zarr-python 3.x doesn't support reopening ZipStore with a new mode." ) def test_dataarray_to_zarr_no_name(self, tmp_store) -> None: - skip_if_zarr_format_3(tmp_store) + self.skip_if_zarr_python_3_and_zip_store(tmp_store) original_da = DataArray(np.arange(1, 13).reshape((3, 4))) original_da.to_zarr(tmp_store) @@ -5368,7 +5363,7 @@ def test_dataarray_to_zarr_no_name(self, tmp_store) -> None: assert_identical(original_da, loaded_da) def test_dataarray_to_zarr_with_name(self, tmp_store) -> None: - skip_if_zarr_format_3(tmp_store) + self.skip_if_zarr_python_3_and_zip_store(tmp_store) original_da = DataArray(np.arange(12).reshape((3, 4)) + 1, name="test") original_da.to_zarr(tmp_store) @@ -5377,7 +5372,7 @@ def test_dataarray_to_zarr_with_name(self, tmp_store) -> None: assert_identical(original_da, loaded_da) def test_dataarray_to_zarr_coord_name_clash(self, tmp_store) -> None: - skip_if_zarr_format_3(tmp_store) + self.skip_if_zarr_python_3_and_zip_store(tmp_store) original_da = DataArray( np.arange(12).reshape((3, 4)) + 1, dims=["x", "y"], name="x" ) @@ -5388,7 +5383,7 @@ def test_dataarray_to_zarr_coord_name_clash(self, tmp_store) -> None: assert_identical(original_da, loaded_da) def test_open_dataarray_options(self, tmp_store) -> None: - skip_if_zarr_format_3(tmp_store) + self.skip_if_zarr_python_3_and_zip_store(tmp_store) data = DataArray(np.arange(5) + 1, coords={"y": ("x", range(1, 6))}, dims=["x"]) data.to_zarr(tmp_store) From af1a0b8d3e611ecfedba7af8007c9f4e2dd4f785 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Wed, 9 Oct 2024 21:56:00 -0400 Subject: [PATCH 33/74] add back dtype skip --- xarray/tests/test_backends.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index be607685f4c..d1a167c7631 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2272,6 +2272,11 @@ def create_store(self): ) def save(self, dataset, store_target, **kwargs): # type: ignore[override] + if have_zarr_v3 and zarr.config.config["default_zarr_version"] == 3: + for k, v in dataset.variables.items(): + if v.dtype.kind in ("M",): + pytest.skip(reason=f"Unsupported dtype {v} for variable: {k}") + return dataset.to_zarr(store=store_target, **kwargs, **self.version_kwargs) @contextlib.contextmanager From 0c2e260f7f7f2d9055047d967135e35cd1a6ca69 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 10 Oct 2024 15:09:48 -0500 Subject: [PATCH 34/74] point upstream to v3 branch --- ci/install-upstream-wheels.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/install-upstream-wheels.sh b/ci/install-upstream-wheels.sh index af8a21c1dbb..f96106869bd 100755 --- a/ci/install-upstream-wheels.sh +++ b/ci/install-upstream-wheels.sh @@ -53,7 +53,7 @@ python -m pip install \ git+https://github.com/dask/dask \ git+https://github.com/dask/dask-expr \ git+https://github.com/dask/distributed \ - git+https://github.com/zarr-developers/zarr.git@main \ + git+https://github.com/zarr-developers/zarr.git@v3 \ git+https://github.com/Unidata/cftime \ git+https://github.com/pypa/packaging \ git+https://github.com/hgrecco/pint \ From 0e47c3fd2fcc24b3b9d7c1d1f924487e9b99391a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 11 Oct 2024 09:55:37 -0500 Subject: [PATCH 35/74] Create temporary directory before using it --- xarray/tests/test_backends.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 2dedca25610..21c4624e5bf 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1381,6 +1381,9 @@ def test_append_overwrite_values(self) -> None: # regression for GH1215 data = create_test_data() with create_tmp_file(allow_cleanup_failure=False) as tmp_file: + # Whose responsibility is it to create the file? + Path(tmp_file).mkdir(parents=True, exist_ok=True) + self.save(data, tmp_file, mode="w") data["var2"][:] = -999 data["var9"] = data["var2"] * 3 From 5b39f422eb4247b8d6b8c84f4f6ce6f28fbb9962 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 11 Oct 2024 11:03:39 -0500 Subject: [PATCH 36/74] Avoid zarr.storage.zip on v2 --- xarray/tests/test_backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 21c4624e5bf..d295af23a49 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5396,7 +5396,7 @@ def test_dataarray_to_netcdf_no_name_pathlib(self) -> None: class TestDataArrayToZarr: def skip_if_zarr_python_3_and_zip_store(self, store) -> None: - if isinstance(store, zarr.storage.zip.ZipStore): + if have_zarr_v3 and isinstance(store, zarr.storage.zip.ZipStore): pytest.skip( reason="zarr-python 3.x doesn't support reopening ZipStore with a new mode." ) From 7d9fc050ed059e5b8de12bfff8da7eb6e0f4ff6a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 11 Oct 2024 11:26:49 -0500 Subject: [PATCH 37/74] fixed close_store_on_close bug --- xarray/backends/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 9f9238f9cba..f3e706756b3 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -678,7 +678,7 @@ def __init__( self._write_region = write_region self._safe_chunks = safe_chunks self._write_empty = write_empty - self._close_store_on_close = (close_store_on_close,) + self._close_store_on_close = close_store_on_close self._use_zarr_fill_value_as_mask = use_zarr_fill_value_as_mask @property From 0fa94eeedaa4cbbae5e7d56f5cb1af07298d7b91 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 11 Oct 2024 13:09:23 -0500 Subject: [PATCH 38/74] Remove workaround, fixed upstream --- xarray/tests/test_backends.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index d295af23a49..20504194819 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1381,9 +1381,6 @@ def test_append_overwrite_values(self) -> None: # regression for GH1215 data = create_test_data() with create_tmp_file(allow_cleanup_failure=False) as tmp_file: - # Whose responsibility is it to create the file? - Path(tmp_file).mkdir(parents=True, exist_ok=True) - self.save(data, tmp_file, mode="w") data["var2"][:] = -999 data["var9"] = data["var2"] * 3 From c2fd6f1f8c7505a15d2590a1f6f030e7a630f4ec Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 11 Oct 2024 13:09:35 -0500 Subject: [PATCH 39/74] Restore original `w` mode. --- xarray/tests/test_backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 20504194819..870e30c5e01 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2277,7 +2277,7 @@ def create_zarr_target(self): def create_store(self): with self.create_zarr_target() as store_target: yield backends.ZarrStore.open_group( - store_target, mode="a", **self.version_kwargs + store_target, mode="w", **self.version_kwargs ) def save(self, dataset, store_target, **kwargs): # type: ignore[override] From ac2ef29894eb672be30e3bdc7d1c5ea243b3fd02 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 11 Oct 2024 13:41:22 -0500 Subject: [PATCH 40/74] workaround for store closing with mode=w --- xarray/tests/test_backends.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 870e30c5e01..402812bc78a 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2582,6 +2582,10 @@ def test_hidden_zarr_keys(self) -> None: for var in expected.variables.keys(): assert self.DIMENSION_KEY not in expected[var].attrs + if have_zarr_v3: + # temporary workaround for https://github.com/zarr-developers/zarr-python/issues/2338 + zarr_group.store._is_open = True + # put it back and try removing from a variable del zarr_group["var2"].attrs[self.DIMENSION_KEY] with pytest.raises(KeyError): From c6be4677c2f5989b66241500c1b1121e4c2837f1 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 11 Oct 2024 13:50:13 -0500 Subject: [PATCH 41/74] typing fixes --- xarray/backends/zarr.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index f3e706756b3..6db3c508cd9 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -69,7 +69,8 @@ class FillValueCoder: @classmethod def encode(cls, value: int | float | str | bytes, dtype: np.dtype[Any]) -> Any: if dtype.kind in "S": - # byte string + # byte string, this implies that 'value' must also be `bytes` dtype. + assert isinstance(value, bytes) return base64.standard_b64encode(value).decode() elif dtype.kind in "b": # boolean @@ -1606,7 +1607,7 @@ def _get_open_params( consolidated = False if _zarr_v3(): - missing_exc: type[Exception] = ValueError + missing_exc = ValueError else: missing_exc = zarr.errors.GroupNotFoundError From 5b5b77fcaf5dfced3eee46b281c8feb2c3c08cd4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 11 Oct 2024 13:53:28 -0500 Subject: [PATCH 42/74] compat --- xarray/core/dtypes.py | 3 ++- xarray/core/npcompat.py | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/xarray/core/dtypes.py b/xarray/core/dtypes.py index 985eb3fa83f..e7ac408112a 100644 --- a/xarray/core/dtypes.py +++ b/xarray/core/dtypes.py @@ -7,6 +7,7 @@ from pandas.api.types import is_extension_array_dtype from xarray.core import array_api_compat, npcompat, utils +from xarray.core.npcompat import HAS_STRING_DTYPE # Use as a sentinel value to indicate a dtype appropriate NA value. NA = utils.ReprObject("") @@ -61,7 +62,7 @@ def maybe_promote(dtype: np.dtype) -> tuple[np.dtype, Any]: # N.B. these casting rules should match pandas dtype_: np.typing.DTypeLike fill_value: Any - if np.issubdtype(dtype, np.dtypes.StringDType()): + if HAS_STRING_DTYPE and np.issubdtype(dtype, np.dtypes.StringDType()): # type: ignore[attr-defined] # for now, we always promote string dtypes to object for consistency with existing behavior # TODO: refactor this once we have a better way to handle numpy vlen-string dtypes dtype_ = object diff --git a/xarray/core/npcompat.py b/xarray/core/npcompat.py index 92d30e1d31b..571ebbbc39b 100644 --- a/xarray/core/npcompat.py +++ b/xarray/core/npcompat.py @@ -35,6 +35,8 @@ try: # requires numpy>=2.0 from numpy import isdtype # type: ignore[attr-defined,unused-ignore] + + HAS_STRING_DTYPE = True except ImportError: import numpy as np from numpy.typing import DTypeLike @@ -71,3 +73,5 @@ def isdtype( return isinstance(dtype, translated_kinds) else: return any(np.issubdtype(dtype, k) for k in translated_kinds) + + HAS_STRING_DTYPE = False From 4f07eb7afa165015705de62a463af73269a99813 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 11 Oct 2024 14:41:22 -0500 Subject: [PATCH 43/74] Remove unnecessary pop --- xarray/backends/zarr.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 6db3c508cd9..a612e42d388 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1057,10 +1057,6 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No else: encoding["write_empty_chunks"] = self._write_empty - if "codecs" in encoding: - pipeline = encoding.pop("codecs") - encoding["codecs"] = pipeline - zarr_array = self.zarr_group.create( name, shape=shape, From 5151bc26a3f36b6577abd45d5511ea4bdeb6186d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 11 Oct 2024 14:44:48 -0500 Subject: [PATCH 44/74] fixed skip --- xarray/tests/test_backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 402812bc78a..27d594b4454 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -149,7 +149,7 @@ def skip_if_zarr_format_3(reason: str): def skip_if_zarr_format_2(reason: str): - if have_zarr_v3 and zarr.config["default_zarr_version"] == 2: + if not have_zarr_v3 or (zarr.config["default_zarr_version"] == 2): pytest.xfail(reason=f"Unsupported with zarr_format=2: {reason}") From 00c62d784e7ef36690aeec453e1826299ee2505a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 11 Oct 2024 14:52:37 -0500 Subject: [PATCH 45/74] fixup types --- xarray/backends/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index a612e42d388..e7fa825e76f 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -92,6 +92,7 @@ def decode(cls, value: int | float | str | bytes, dtype: str | np.dtype[Any]): return str(value) elif dtype == "bytes": # zarr V3 bytes type + assert isinstance(value, str | bytes) return base64.standard_b64decode(value) np_dtype = np.dtype(dtype) if np_dtype.kind in "f": @@ -1641,7 +1642,6 @@ def _get_open_params( close_store_on_close = zarr_group.store is not store # we use this to determine how to handle fill_value - is_zarr_v3_format: bool if _zarr_v3(): is_zarr_v3_format = zarr_group.metadata.zarr_format == 3 else: From e0390a5f95602d62d9b8f5e7038d804e40e8b21c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 11 Oct 2024 15:14:39 -0500 Subject: [PATCH 46/74] fixup types --- xarray/backends/zarr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index e7fa825e76f..654d0832c56 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -96,6 +96,7 @@ def decode(cls, value: int | float | str | bytes, dtype: str | np.dtype[Any]): return base64.standard_b64decode(value) np_dtype = np.dtype(dtype) if np_dtype.kind in "f": + assert isinstance(value, str | bytes) return struct.unpack(" Date: Sat, 12 Oct 2024 10:45:34 -0500 Subject: [PATCH 47/74] [test-upstream] From 26081d4f950f39f7bbc24cea264d5aff24db730f Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Sat, 12 Oct 2024 15:58:02 -0700 Subject: [PATCH 48/74] Update install-upstream-wheels.sh --- ci/install-upstream-wheels.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/install-upstream-wheels.sh b/ci/install-upstream-wheels.sh index f96106869bd..e9e2d46e460 100755 --- a/ci/install-upstream-wheels.sh +++ b/ci/install-upstream-wheels.sh @@ -45,8 +45,8 @@ python -m pip install \ --pre \ --upgrade \ pyarrow -# manually install `pint` to pull in new dependencies -python -m pip install --upgrade pint +# manually install `pint`, `donfig`, and `crc32c` to pull in new dependencies +python -m pip install --upgrade pint donfig crc32c python -m pip install \ --no-deps \ --upgrade \ From 035005668b38ba11fa373a0c3a1071d7663ee728 Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Sun, 13 Oct 2024 12:19:53 -0700 Subject: [PATCH 49/74] set use_consolidated to false when user provides consolidated=False --- xarray/backends/zarr.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 654d0832c56..13bf7e7ac88 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1639,6 +1639,10 @@ def _get_open_params( # TODO: an option to pass the metadata_key keyword zarr_group = zarr.open_consolidated(store, **open_kwargs) else: + if _zarr_v3(): + # we have determined that we don't want to use consolidated metadata + # so we set that to False to avoid trying to read it + open_kwargs["use_consolidated"] = False zarr_group = zarr.open_group(store, **open_kwargs) close_store_on_close = zarr_group.store is not store From a38bff6c39bcda4403b2818e14b953a5cdf3fe11 Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Mon, 14 Oct 2024 06:37:23 -0700 Subject: [PATCH 50/74] fix: import consolidated_metadata from package root --- xarray/core/datatree_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/datatree_io.py b/xarray/core/datatree_io.py index 36665a0d153..c54c8be6d48 100644 --- a/xarray/core/datatree_io.py +++ b/xarray/core/datatree_io.py @@ -129,7 +129,7 @@ def _datatree_to_zarr( See `DataTree.to_zarr` for full API docs. """ - from zarr.convenience import consolidate_metadata + from zarr import consolidate_metadata if group is not None: raise NotImplementedError( From 08f05943b50cde21628a2ee17a435b46522705d2 Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Mon, 14 Oct 2024 06:41:18 -0700 Subject: [PATCH 51/74] fix: relax instrumented store checks for v3 --- xarray/tests/test_backends.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 27d594b4454..4041e1a20b5 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3241,7 +3241,6 @@ def test_chunked_cftime_datetime(self) -> None: @requires_zarr -@pytest.mark.skipif(not have_zarr_v3, reason="requires zarr version 3") class TestInstrumentedZarrStore: if have_zarr_v3: @@ -3338,8 +3337,8 @@ def test_append(self) -> None: if have_zarr_v3: expected = { "set": 10, - "get": 8, - "list_dir": 1, + "get": 16, # TODO: fixme upstream (should be 8) + "list_dir": 3, # TODO: fixme upstream (should be 2) "list_prefix": 0, } else: @@ -3361,8 +3360,8 @@ def test_append(self) -> None: if have_zarr_v3: expected = { "set": 10, - "get": 8, - "list_dir": 2, + "get": 16, # TODO: fixme upstream (should be 8) + "list_dir": 3, # TODO: fixme upstream (should be 2) "list_prefix": 0, } else: From f2f9fff03bf9bc1acb21c2bebc532e01d2cb0e02 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 14 Oct 2024 13:50:12 -0500 Subject: [PATCH 52/74] Adjust 2.18.3 thresholds --- xarray/tests/test_backends.py | 64 ++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 27 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 4041e1a20b5..c64e7a36da7 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -124,13 +124,16 @@ ZARR_FORMATS = [2, 3] else: - from zarr.storage import KVStore - ZARR_FORMATS = [2] + try: + from zarr import KVStoreV3 as KVStore + except ImportError: + KVStore = None else: have_zarr_v3 = False have_zarr_kvstore = False have_zarr_directory_store = False + KVStore = None ZARR_FORMATS = [] @@ -3241,6 +3244,9 @@ def test_chunked_cftime_datetime(self) -> None: @requires_zarr +@pytest.mark.skipif( + KVStore is None, reason="zarr-python 2.x or ZARR_V3_EXPERIMENTAL_API is unset." +) class TestInstrumentedZarrStore: if have_zarr_v3: @@ -3318,10 +3324,10 @@ def test_append(self) -> None: } else: expected = { - "iter": 2, - "contains": 9, - "setitem": 9, - "getitem": 6, + "iter": 3, + "contains": 18, + "setitem": 10, + "getitem": 13, "listdir": 2, "list_prefix": 2, } @@ -3343,10 +3349,10 @@ def test_append(self) -> None: } else: expected = { - "iter": 2, - "contains": 2, - "setitem": 5, - "getitem": 6, + "iter": 3, + "contains": 9, + "setitem": 6, + "getitem": 13, "listdir": 2, "list_prefix": 0, } @@ -3366,10 +3372,10 @@ def test_append(self) -> None: } else: expected = { - "iter": 2, - "contains": 2, - "setitem": 5, - "getitem": 6, + "iter": 3, + "contains": 9, + "setitem": 6, + "getitem": 13, "listdir": 2, "list_prefix": 0, } @@ -3396,13 +3402,14 @@ def test_region_write(self) -> None: } else: expected = { - "iter": 2, - "contains": 7, - "setitem": 8, - "getitem": 6, + "iter": 3, + "contains": 16, + "setitem": 9, + "getitem": 13, "listdir": 2, "list_prefix": 4, } + patches = self.make_patches(store) with patch.multiple(KVStore, **patches): ds.to_zarr(store, mode="w", compute=False) @@ -3420,12 +3427,13 @@ def test_region_write(self) -> None: else: expected = { "iter": 2, - "contains": 2, + "contains": 4, "setitem": 1, - "getitem": 3, + "getitem": 4, "listdir": 2, "list_prefix": 0, } + patches = self.make_patches(store) with patch.multiple(KVStore, **patches): ds.to_zarr(store, region={"x": slice(None)}) @@ -3443,12 +3451,13 @@ def test_region_write(self) -> None: else: expected = { "iter": 2, - "contains": 2, + "contains": 4, "setitem": 1, - "getitem": 5, + "getitem": 6, "listdir": 2, "list_prefix": 0, } + patches = self.make_patches(store) with patch.multiple(KVStore, **patches): ds.to_zarr(store, region="auto") @@ -3463,13 +3472,14 @@ def test_region_write(self) -> None: } else: expected = { - "iter": 1, - "contains": 2, - "setitem": 0, - "getitem": 5, - "listdir": 1, + "iter": 2, + "contains": 4, + "setitem": 1, + "getitem": 6, + "listdir": 2, "list_prefix": 0, } + patches = self.make_patches(store) with patch.multiple(KVStore, **patches): with open_dataset(store, engine="zarr") as actual: From a84fa795230b94a06e3999cb31d0ff28a878147e Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Mon, 14 Oct 2024 11:58:29 -0700 Subject: [PATCH 53/74] skip datatree zarr tests w/ zarr 3 for now --- xarray/tests/test_backends_datatree.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index e109ea5d38d..a062fdbacc6 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -24,6 +24,8 @@ except ImportError: pass +have_zarr_v3 = xr.backends.zarr._zarr_v3() + @pytest.fixture(scope="module") def unaligned_datatree_nc(tmp_path_factory): @@ -243,6 +245,9 @@ class TestH5NetCDFDatatreeIO(DatatreeIOBase): engine: T_DataTreeNetcdfEngine | None = "h5netcdf" +@pytest.mark.skipif( + have_zarr_v3, reason="datatree support for zarr 3 is not implemented yet" +) @requires_zarr class TestZarrDatatreeIO: engine = "zarr" From 9fec1d6d0f39934ce4315ed490823ccfb16af7ff Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 14 Oct 2024 14:02:12 -0500 Subject: [PATCH 54/74] fixed kvstore usage --- xarray/tests/test_backends.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index c64e7a36da7..a2ffbb298e1 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3655,16 +3655,19 @@ def test_zarr_storage_options() -> None: @requires_zarr def test_zarr_version_deprecated() -> None: ds = create_test_data() - store = KVStore({}) + if have_zarr_v3: + store = KVStore() + else: + store = {} with pytest.warns(FutureWarning, match="zarr_version"): - ds.to_zarr(store, zarr_version=2) + ds.to_zarr(store=store, zarr_version=2) with pytest.warns(FutureWarning, match="zarr_version"): - xr.open_zarr(store, zarr_version=2) + xr.open_zarr(store=store, zarr_version=2) with pytest.raises(ValueError, match="zarr_format"): - xr.open_zarr(store, zarr_version=2, zarr_format=3) + xr.open_zarr(store=store, zarr_version=2, zarr_format=3) @requires_scipy From 04c017ea6ab000f4e529a90fd5955b5bc3da4e0d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 14 Oct 2024 14:09:12 -0500 Subject: [PATCH 55/74] typing fixes --- xarray/tests/test_backends.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index a2ffbb298e1..b5e8116d493 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -24,6 +24,7 @@ import numpy as np import pandas as pd import pytest +import zarr.codecs from packaging.version import Version from pandas.errors import OutOfBoundsDatetime @@ -126,14 +127,16 @@ else: ZARR_FORMATS = [2] try: - from zarr import KVStoreV3 as KVStore + from zarr import ( + KVStoreV3 as KVStore, # type: ignore[attr-defined,no-redef,unused-ignore] + ) except ImportError: - KVStore = None + KVStore = None # type: ignore[misc,unused-ignore] else: have_zarr_v3 = False have_zarr_kvstore = False have_zarr_directory_store = False - KVStore = None + KVStore = None # type: ignore[misc,unused-ignore] ZARR_FORMATS = [] @@ -2817,6 +2820,7 @@ def test_check_encoding_is_consistent_after_append(self) -> None: with self.create_zarr_target() as store_target: import numcodecs + encoding_value: Any if have_zarr_v3 and zarr.config.config["default_zarr_version"] == 3: compressor = zarr.codecs.BloscCodec() encoding_key = "codecs" @@ -3277,7 +3281,7 @@ def create_zarr_target(self): if have_zarr_v3: kwargs = {"mode": "a"} else: - kwargs = {} + kwargs = {} # type: ignore[arg-type,unused-ignore] store = KVStore({}, **kwargs) yield store From 625591ea936d2d28444627f76d9f03f433628835 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 14 Oct 2024 14:23:40 -0500 Subject: [PATCH 56/74] move zarr.codecs import --- xarray/tests/test_backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index b5e8116d493..c41bf43d41f 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -24,7 +24,6 @@ import numpy as np import pandas as pd import pytest -import zarr.codecs from packaging.version import Version from pandas.errors import OutOfBoundsDatetime @@ -110,6 +109,7 @@ try: import zarr + import zarr.codecs except ImportError: have_zarr = False else: From 3795b0761a70068bf0d060beaa45a05d67dedbcf Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 14 Oct 2024 15:10:28 -0500 Subject: [PATCH 57/74] fixup ignores --- xarray/tests/test_backends.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index c41bf43d41f..8a8818183bd 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -127,11 +127,11 @@ else: ZARR_FORMATS = [2] try: - from zarr import ( - KVStoreV3 as KVStore, # type: ignore[attr-defined,no-redef,unused-ignore] + from zarr import ( # type: ignore[attr-defined,no-redef,unused-ignore] + KVStoreV3 as KVStore, ) except ImportError: - KVStore = None # type: ignore[misc,unused-ignore] + KVStore = None # type: ignore[assignment,misc,unused-ignore] else: have_zarr_v3 = False have_zarr_kvstore = False @@ -3283,7 +3283,7 @@ def create_zarr_target(self): else: kwargs = {} # type: ignore[arg-type,unused-ignore] - store = KVStore({}, **kwargs) + store = KVStore({}, **kwargs) # type: ignore[arg-type,unused-ignore] yield store def make_patches(self, store): @@ -3628,7 +3628,7 @@ def test_avoid_excess_metadata_calls(self) -> None: else: Group = zarr.Group patched = patch.object( - Group, "__getitem__", side_effect=Group.__getitem__, autospec=True + Group, "__getitem__", side_effect=Group.__getitem__, autospec=True # type: ignore[assignment,unused-ignore] ) with self.create_zarr_target() as store, patched as mock: @@ -3659,6 +3659,7 @@ def test_zarr_storage_options() -> None: @requires_zarr def test_zarr_version_deprecated() -> None: ds = create_test_data() + store: Any if have_zarr_v3: store = KVStore() else: From ea2cb578f21a1c4a0983dc93809b50488194111a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 14 Oct 2024 15:24:08 -0500 Subject: [PATCH 58/74] storage options fix, skip --- xarray/backends/api.py | 19 +++++++++++++------ xarray/tests/test_backends.py | 1 + 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index cb2ffb30690..2068fc95701 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -33,6 +33,7 @@ _normalize_path, ) from xarray.backends.locks import _get_scheduler +from xarray.backends.zarr import _zarr_v3 from xarray.core import indexing from xarray.core.combine import ( _infer_concat_order_from_positions, @@ -1700,21 +1701,26 @@ def to_zarr( store = _normalize_path(store) chunk_store = _normalize_path(chunk_store) + kwargs = {} if storage_options is None: mapper = store chunk_mapper = chunk_store else: - from fsspec import get_mapper - if not isinstance(store, str): raise ValueError( f"store must be a string to use storage_options. Got {type(store)}" ) - mapper = get_mapper(store, **storage_options) - if chunk_store is not None: - chunk_mapper = get_mapper(chunk_store, **storage_options) + + if _zarr_v3(): + kwargs["storage_options"] = storage_options else: - chunk_mapper = chunk_store + from fsspec import get_mapper + + mapper = get_mapper(store, **storage_options) + if chunk_store is not None: + chunk_mapper = get_mapper(chunk_store, **storage_options) + else: + chunk_mapper = chunk_store if encoding is None: encoding = {} @@ -1765,6 +1771,7 @@ def to_zarr( zarr_version=zarr_version, zarr_format=zarr_format, write_empty=write_empty_chunks, + **kwargs, ) if region is not None: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 8a8818183bd..92f3834bf60 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3647,6 +3647,7 @@ def test_avoid_excess_metadata_calls(self) -> None: @requires_zarr @requires_fsspec +@pytest.mark.skipif(have_zarr_v3, reason="Difficult to test.") def test_zarr_storage_options() -> None: pytest.importorskip("aiobotocore") ds = create_test_data() From 4f617d2a45db88ab910ab032bf652bc4a217b925 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 14 Oct 2024 15:24:16 -0500 Subject: [PATCH 59/74] fixed types --- xarray/tests/test_backends.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 92f3834bf60..42bc5ba59ad 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -136,7 +136,7 @@ have_zarr_v3 = False have_zarr_kvstore = False have_zarr_directory_store = False - KVStore = None # type: ignore[misc,unused-ignore] + KVStore = None # type: ignore[assignment,misc,unused-ignore] ZARR_FORMATS = [] @@ -3620,6 +3620,7 @@ def test_avoid_excess_metadata_calls(self) -> None: # Use of side_effect means that calls are passed through to the original method # rather than a mocked method. + Group: Any if have_zarr_v3: Group = zarr.AsyncGroup patched = patch.object( @@ -3628,7 +3629,7 @@ def test_avoid_excess_metadata_calls(self) -> None: else: Group = zarr.Group patched = patch.object( - Group, "__getitem__", side_effect=Group.__getitem__, autospec=True # type: ignore[assignment,unused-ignore] + Group, "__getitem__", side_effect=Group.__getitem__, autospec=True ) with self.create_zarr_target() as store, patched as mock: From d1e3c732bf7c29f946be7ef0d6303ee1c3485ef8 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Mon, 14 Oct 2024 15:29:43 -0700 Subject: [PATCH 60/74] Update ci/install-upstream-wheels.sh --- ci/install-upstream-wheels.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/install-upstream-wheels.sh b/ci/install-upstream-wheels.sh index e9e2d46e460..a2c41242963 100755 --- a/ci/install-upstream-wheels.sh +++ b/ci/install-upstream-wheels.sh @@ -53,7 +53,7 @@ python -m pip install \ git+https://github.com/dask/dask \ git+https://github.com/dask/dask-expr \ git+https://github.com/dask/distributed \ - git+https://github.com/zarr-developers/zarr.git@v3 \ + git+https://github.com/zarr-developers/zarr \ git+https://github.com/Unidata/cftime \ git+https://github.com/pypa/packaging \ git+https://github.com/hgrecco/pint \ From 45d5a78df7d5ec09c0c63881c93fbdbefd778670 Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Mon, 14 Oct 2024 16:11:46 -0700 Subject: [PATCH 61/74] type fixes --- xarray/backends/zarr.py | 2 +- xarray/tests/test_backends_datatree.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 05a7a43b975..f2010f34462 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1559,7 +1559,7 @@ def _iter_zarr_groups(root: ZarrGroup, parent: str = "/") -> Iterable[str]: for path, group in root.groups(): gpath = parent_nodepath / path yield str(gpath) - yield from _iter_zarr_groups(group, parent=gpath) + yield from _iter_zarr_groups(group, parent=str(gpath)) def _get_open_params( diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index a062fdbacc6..9252d9c09ab 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -286,7 +286,7 @@ def test_to_zarr_zip_store(self, tmpdir, simple_datatree): store = ZipStore(filepath) original_dt.to_zarr(store) - roundtrip_dt = open_datatree(store, engine="zarr") + roundtrip_dt = open_datatree(store, engine="zarr") # type: ignore[arg-type] assert_equal(original_dt, roundtrip_dt) def test_to_zarr_not_consolidated(self, tmpdir, simple_datatree): From f208c39fe3add110c896620daab071a14e35b6a1 Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Mon, 14 Oct 2024 16:18:17 -0700 Subject: [PATCH 62/74] whats-new --- doc/whats-new.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index b374721c8ee..a145e688d4f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -37,6 +37,10 @@ New Features By `Holly Mandel `_. - Implement handling of complex numbers (netcdf4/h5netcdf) and enums (h5netcdf) (:issue:`9246`, :issue:`3297`, :pull:`9509`). By `Kai Mühlbauer `_. +- Support for Zarr-Python 3 (:issue:`95515`, :pull:`9552`). + By `Tom Augspurger `_, + `Ryan Abernathey `_ and + `Joe Hamman `_. Breaking changes ~~~~~~~~~~~~~~~~ From 968217ce9adb2142c55dc9aec5edbf403668f47a Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Mon, 14 Oct 2024 16:24:22 -0700 Subject: [PATCH 63/74] Update xarray/tests/test_backends_datatree.py --- xarray/tests/test_backends_datatree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index 9252d9c09ab..a062fdbacc6 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -286,7 +286,7 @@ def test_to_zarr_zip_store(self, tmpdir, simple_datatree): store = ZipStore(filepath) original_dt.to_zarr(store) - roundtrip_dt = open_datatree(store, engine="zarr") # type: ignore[arg-type] + roundtrip_dt = open_datatree(store, engine="zarr") assert_equal(original_dt, roundtrip_dt) def test_to_zarr_not_consolidated(self, tmpdir, simple_datatree): From 45a37f63f16c99c9b4cb491a9a7c64d9873a9733 Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Mon, 14 Oct 2024 16:34:49 -0700 Subject: [PATCH 64/74] fix type import --- xarray/core/types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/types.py b/xarray/core/types.py index 1d383d550ec..33755c753c7 100644 --- a/xarray/core/types.py +++ b/xarray/core/types.py @@ -66,7 +66,7 @@ CubedArray = np.ndarray try: - from zarr.core import Array as ZarrArray + from zarr import Array as ZarrArray except ImportError: ZarrArray = np.ndarray From c10bfc0de120b3c7777151b73dbc2f60d2a9ffa2 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 14 Oct 2024 21:13:06 -0500 Subject: [PATCH 65/74] set mapper, chunk_mapper --- xarray/backends/api.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 2068fc95701..4188e4e347b 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -1713,6 +1713,8 @@ def to_zarr( if _zarr_v3(): kwargs["storage_options"] = storage_options + mapper = store + chunk_mapper = chunk_store else: from fsspec import get_mapper From 82e6a6d380a4414f8310e4afde42b9985c2527d7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 15 Oct 2024 07:28:30 -0500 Subject: [PATCH 66/74] Pass through zarr_format --- xarray/backends/zarr.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index f2010f34462..2f5adc17bb1 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -992,7 +992,6 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No # TODO: see if zarr should normalize these strings. zarr_array = zarr.open( **kwargs, - # path=f"{self.zarr_group.name}/{name}", path="/".join([self.zarr_group.name.rstrip("/"), name]).lstrip( "/" ), @@ -1457,7 +1456,7 @@ def open_datatree( storage_options=None, stacklevel=3, zarr_version=None, - # TODO: zarr_format + zarr_format=None, **kwargs, ) -> DataTree: from xarray.core.datatree import DataTree @@ -1480,6 +1479,7 @@ def open_datatree( storage_options=storage_options, stacklevel=stacklevel, zarr_version=zarr_version, + zarr_format=zarr_format, **kwargs, ) @@ -1504,6 +1504,7 @@ def open_groups_as_dict( storage_options=None, stacklevel=3, zarr_version=None, + zarr_format=None, **kwargs, ) -> dict[str, Dataset]: @@ -1528,6 +1529,7 @@ def open_groups_as_dict( storage_options=storage_options, stacklevel=stacklevel + 1, zarr_version=zarr_version, + zarr_format=zarr_format, ) groups_dict = {} From d752693b7a73ca99a119ca823de1e760349c0c19 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 15 Oct 2024 07:30:13 -0500 Subject: [PATCH 67/74] Fixup --- xarray/tests/test_backends.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 42bc5ba59ad..c37b3fc3af1 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2840,8 +2840,7 @@ def test_check_encoding_is_consistent_after_append(self) -> None: actual_ds = xr.open_dataset( store_target, engine="zarr", **self.version_kwargs ) - # assert actual_encoding.get_config() == compressor.get_config() - # TODO: check whether this approach works for v2 + actual_encoding = actual_ds["da"].encoding[encoding_key] assert original_encoding == actual_encoding assert_identical( From c2a47a128e242f81d61e96f352dbd181ec32e1b6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 15 Oct 2024 07:33:55 -0500 Subject: [PATCH 68/74] more cleanup --- xarray/backends/zarr.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 2f5adc17bb1..8231a325b7e 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -119,11 +119,6 @@ def encode_zarr_attr_value(value): """ if isinstance(value, np.ndarray): encoded = value.tolist() - # elif isinstance(value, bytes): - # try to match how Zarr encodes bytes - # return [int.from_bytes(value)] - # return value.decode("utf-8") - # this checks if it's a scalar number elif isinstance(value, np.generic): encoded = value.item() else: From 26b26610d0630b415136a2b0d1c719c2e5c93adb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 15 Oct 2024 08:50:19 -0500 Subject: [PATCH 69/74] revert test changes --- xarray/tests/test_backends.py | 69 ++++++++++++++++------------------- 1 file changed, 32 insertions(+), 37 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index c37b3fc3af1..28f24e69d1c 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -537,7 +537,7 @@ def test_dataset_caching(self) -> None: @pytest.mark.filterwarnings("ignore:deallocating CachingFileManager") def test_roundtrip_None_variable(self) -> None: - expected = Dataset({None: (("x", "y"), [[1, 1], [2, 3]])}) + expected = Dataset({None: (("x", "y"), [[0, 1], [2, 3]])}) with self.roundtrip(expected) as actual: assert_identical(expected, actual) @@ -655,7 +655,7 @@ def test_roundtrip_example_1_netcdf(self) -> None: def test_roundtrip_coordinates(self) -> None: original = Dataset( - {"foo": ("x", [1, 1])}, {"x": [2, 3], "y": ("a", [42]), "z": ("x", [4, 5])} + {"foo": ("x", [0, 1])}, {"x": [2, 3], "y": ("a", [42]), "z": ("x", [4, 5])} ) with self.roundtrip(original) as actual: @@ -671,7 +671,7 @@ def test_roundtrip_coordinates(self) -> None: def test_roundtrip_global_coordinates(self) -> None: original = Dataset( - {"foo": ("x", [1, 1])}, {"x": [2, 3], "y": ("a", [42]), "z": ("x", [4, 5])} + {"foo": ("x", [0, 1])}, {"x": [2, 3], "y": ("a", [42]), "z": ("x", [4, 5])} ) with self.roundtrip(original) as actual: assert_identical(original, actual) @@ -687,8 +687,8 @@ def test_roundtrip_global_coordinates(self) -> None: assert attrs["coordinates"] == "foo" def test_roundtrip_coordinates_with_space(self) -> None: - original = Dataset(coords={"x": 1, "y z": 1}) - expected = Dataset({"y z": 1}, {"x": 1}) + original = Dataset(coords={"x": 0, "y z": 1}) + expected = Dataset({"y z": 1}, {"x": 0}) with pytest.warns(SerializationWarning): with self.roundtrip(original) as actual: assert_identical(expected, actual) @@ -880,7 +880,7 @@ def test_dropna(self) -> None: a = np.random.randn(4, 3) a[1, 1] = np.nan in_memory = xr.Dataset( - {"a": (("y", "x"), a)}, coords={"y": np.arange(1, 5), "x": np.arange(1, 4)} + {"a": (("y", "x"), a)}, coords={"y": np.arange(4), "x": np.arange(3)} ) assert_identical( @@ -1101,11 +1101,11 @@ def _create_cf_dataset(): ), ), dict( - latitude=("latitude", [-1, 1], {"units": "degrees_north"}), - longitude=("longitude", [-1, 1], {"units": "degrees_east"}), + latitude=("latitude", [0, 1], {"units": "degrees_north"}), + longitude=("longitude", [0, 1], {"units": "degrees_east"}), latlon=((), -1, {"grid_mapping_name": "latitude_longitude"}), - latitude_bnds=(("latitude", "bnds2"), [[-1, 1], [1, 2]]), - longitude_bnds=(("longitude", "bnds2"), [[-1, 1], [1, 2]]), + latitude_bnds=(("latitude", "bnds2"), [[0, 1], [1, 2]]), + longitude_bnds=(("longitude", "bnds2"), [[0, 1], [1, 2]]), areas=( ("latitude", "longitude"), [[1, 1], [1, 1]], @@ -1199,7 +1199,7 @@ def equals_latlon(obj): return obj == "lat lon" or obj == "lon lat" original = Dataset( - {"temp": ("x", [1, 1]), "precip": ("x", [-1, -1])}, + {"temp": ("x", [0, 1]), "precip": ("x", [0, -1])}, {"lat": ("x", [2, 3]), "lon": ("x", [4, 5])}, ) with self.roundtrip(original) as actual: @@ -1330,7 +1330,6 @@ def test_default_fill_value(self) -> None: with warnings.catch_warnings(): warnings.filterwarnings("ignore", ".*floating point data as an integer") with self.roundtrip(ds, save_kwargs=kwargs) as actual: - # XXX: zarr v3 always writes fill_value so this fails. assert "_FillValue" not in actual.x.encoding assert ds.x.encoding == {} @@ -3067,7 +3066,7 @@ def test_write_preexisting_override_metadata(self) -> None: assert_identical(actual, only_new_data) def test_write_region_errors(self) -> None: - data = Dataset({"u": (("x",), np.arange(1, 6))}) + data = Dataset({"u": (("x",), np.arange(5))}) data2 = Dataset({"u": (("x",), np.array([10, 11]))}) @contextlib.contextmanager @@ -3079,7 +3078,7 @@ def setup_and_verify_store(expected=data): assert_identical(actual, expected) # verify the base case works - expected = Dataset({"u": (("x",), np.array([10, 11, 3, 4, 5]))}) + expected = Dataset({"u": (("x",), np.array([10, 11, 2, 3, 4]))}) with setup_and_verify_store(expected) as store: data2.to_zarr(store, region={"x": slice(2)}, **self.version_kwargs) @@ -3150,9 +3149,9 @@ def test_encoding_chunksizes(self) -> None: original = xr.Dataset( {}, coords={ - "x": np.arange(nx) + 1, - "y": np.arange(ny) + 1, - "t": np.arange(nt) + 1, + "x": np.arange(nx), + "y": np.arange(ny), + "t": np.arange(nt), }, ) original["v"] = xr.Variable(("x", "y", "t"), np.zeros((nx, ny, nt))) @@ -3313,7 +3312,7 @@ def check_requests(self, expected, patches): assert summary[k] <= expected[k], (k, summary) def test_append(self) -> None: - original = Dataset({"foo": ("x", [1])}, coords={"x": [-1]}) + original = Dataset({"foo": ("x", [1])}, coords={"x": [0]}) modified = Dataset({"foo": ("x", [2])}, coords={"x": [1]}) with self.create_zarr_target() as store: @@ -5422,7 +5421,7 @@ def skip_if_zarr_python_3_and_zip_store(self, store) -> None: def test_dataarray_to_zarr_no_name(self, tmp_store) -> None: self.skip_if_zarr_python_3_and_zip_store(tmp_store) - original_da = DataArray(np.arange(1, 13).reshape((3, 4))) + original_da = DataArray(np.arange(12).reshape((3, 4))) original_da.to_zarr(tmp_store) @@ -5431,7 +5430,7 @@ def test_dataarray_to_zarr_no_name(self, tmp_store) -> None: def test_dataarray_to_zarr_with_name(self, tmp_store) -> None: self.skip_if_zarr_python_3_and_zip_store(tmp_store) - original_da = DataArray(np.arange(12).reshape((3, 4)) + 1, name="test") + original_da = DataArray(np.arange(12).reshape((3, 4)), name="test") original_da.to_zarr(tmp_store) @@ -5441,7 +5440,7 @@ def test_dataarray_to_zarr_with_name(self, tmp_store) -> None: def test_dataarray_to_zarr_coord_name_clash(self, tmp_store) -> None: self.skip_if_zarr_python_3_and_zip_store(tmp_store) original_da = DataArray( - np.arange(12).reshape((3, 4)) + 1, dims=["x", "y"], name="x" + np.arange(12).reshape((3, 4)), dims=["x", "y"], name="x" ) original_da.to_zarr(tmp_store) @@ -5451,7 +5450,7 @@ def test_dataarray_to_zarr_coord_name_clash(self, tmp_store) -> None: def test_open_dataarray_options(self, tmp_store) -> None: self.skip_if_zarr_python_3_and_zip_store(tmp_store) - data = DataArray(np.arange(5) + 1, coords={"y": ("x", range(1, 6))}, dims=["x"]) + data = DataArray(np.arange(5), coords={"y": ("x", range(1, 6))}, dims=["x"]) data.to_zarr(tmp_store) @@ -5464,7 +5463,7 @@ def test_dataarray_to_zarr_compute_false(self, tmp_store) -> None: from dask.delayed import Delayed skip_if_zarr_format_3(tmp_store) - original_da = DataArray(np.arange(12).reshape((3, 4)) + 1) + original_da = DataArray(np.arange(12).reshape((3, 4))) output = original_da.to_zarr(tmp_store, compute=False) assert isinstance(output, Delayed) @@ -6033,7 +6032,7 @@ def test_pickle_open_mfdataset_dataset(): @pytest.mark.usefixtures("default_zarr_version") def test_zarr_closing_internal_zip_store(): store_name = "tmp.zarr.zip" - original_da = DataArray(np.arange(1, 13).reshape((3, 4))) + original_da = DataArray(np.arange(12).reshape((3, 4))) original_da.to_zarr(store_name, mode="w") with open_dataarray(store_name, engine="zarr") as loaded_da: @@ -6044,8 +6043,8 @@ def test_zarr_closing_internal_zip_store(): @pytest.mark.usefixtures("default_zarr_version") class TestZarrRegionAuto: def test_zarr_region_auto_all(self, tmp_path): - x = np.arange(0, 50, 10) + 1 - y = np.arange(0, 20, 2) + 1 + x = np.arange(0, 50, 10) + y = np.arange(0, 20, 2) data = np.ones((5, 10)) ds = xr.Dataset( { @@ -6068,8 +6067,8 @@ def test_zarr_region_auto_all(self, tmp_path): assert_identical(ds_updated, expected) def test_zarr_region_auto_mixed(self, tmp_path): - x = np.arange(0, 50, 10) + 1 - y = np.arange(0, 20, 2) + 1 + x = np.arange(0, 50, 10) + y = np.arange(0, 20, 2) data = np.ones((5, 10)) ds = xr.Dataset( { @@ -6094,8 +6093,8 @@ def test_zarr_region_auto_mixed(self, tmp_path): assert_identical(ds_updated, expected) def test_zarr_region_auto_noncontiguous(self, tmp_path): - x = np.arange(0, 50, 10) + 1 - y = np.arange(0, 20, 2) + 1 + x = np.arange(0, 50, 10) + y = np.arange(0, 20, 2) data = np.ones((5, 10)) ds = xr.Dataset( { @@ -6312,9 +6311,7 @@ def test_zarr_region_chunk_partial_offset(tmp_path): # https://github.com/pydata/xarray/pull/8459#issuecomment-1819417545 store = tmp_path / "foo.zarr" data = np.ones((30,)) - da = xr.DataArray(data, dims=["x"], coords={"x": range(1, 31)}, name="foo").chunk( - x=10 - ) + da = xr.DataArray(data, dims=["x"], coords={"x": range(30)}, name="foo").chunk(x=10) da.to_zarr(store, compute=False) da.isel(x=slice(10)).chunk(x=(10,)).to_zarr(store, region="auto") @@ -6333,9 +6330,7 @@ def test_zarr_region_chunk_partial_offset(tmp_path): def test_zarr_safe_chunk_append_dim(tmp_path): store = tmp_path / "foo.zarr" data = np.ones((20,)) - da = xr.DataArray(data, dims=["x"], coords={"x": range(1, 21)}, name="foo").chunk( - x=5 - ) + da = xr.DataArray(data, dims=["x"], coords={"x": range(20)}, name="foo").chunk(x=5) da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") with pytest.raises(ValueError): @@ -6387,7 +6382,7 @@ def test_zarr_safe_chunk_region(tmp_path): store = tmp_path / "foo.zarr" arr = xr.DataArray( - list(range(1, 12)), dims=["a"], coords={"a": list(range(1, 12))}, name="foo" + list(range(11)), dims=["a"], coords={"a": list(range(11))}, name="foo" ).chunk(a=3) arr.to_zarr(store, mode="w") From 1d73d36f63f8a86046db805aab8e680c952dec19 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 21 Oct 2024 14:49:36 -0600 Subject: [PATCH 70/74] Update xarray/backends/zarr.py --- xarray/backends/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 8231a325b7e..c5151e47e92 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -722,7 +722,7 @@ def open_store_variable(self, name, zarr_array=None): if self._use_zarr_fill_value_as_mask: # Setting this attribute triggers CF decoding for missing values - # TODO: it feels a bit hacky to hijack CF decoding for this purpose + # by interpreting Zarr's fill_value to mean the same as netCDF's _FillValue if zarr_array.fill_value is not None: attributes["_FillValue"] = zarr_array.fill_value elif "_FillValue" in attributes: From be79e8802f31cbd517eec60dcc590e8d26e2dcc9 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 21 Oct 2024 14:47:09 -0600 Subject: [PATCH 71/74] cleanup --- xarray/backends/zarr.py | 53 ++++++++++-------------- xarray/tests/__init__.py | 2 + xarray/tests/test_backends.py | 76 ++++++++++++++--------------------- 3 files changed, 53 insertions(+), 78 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index c5151e47e92..de2f7d12234 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1,7 +1,6 @@ from __future__ import annotations import base64 -import functools import json import os import struct @@ -10,7 +9,6 @@ from typing import TYPE_CHECKING, Any, Literal import numpy as np -import packaging.version import pandas as pd from xarray import coding, conventions @@ -30,10 +28,12 @@ FrozenDict, HiddenKeyDict, close_on_error, + emit_user_level_warning, ) from xarray.core.variable import Variable from xarray.namedarray.parallelcompat import guess_chunkmanager from xarray.namedarray.pycompat import integer_types +from xarray.namedarray.utils import module_available if TYPE_CHECKING: from io import BufferedIOBase @@ -45,14 +45,9 @@ from xarray.core.datatree import DataTree -@functools.lru_cache def _zarr_v3() -> bool: - try: - import zarr - except ImportError: - return False - else: - return packaging.version.parse(zarr.__version__).major >= 3 + # TODO: switch to "3" once Zarr V3 is released + return module_available("zarr", minversion="2.99") # need some special secret attributes to tell us the dimensions @@ -946,8 +941,7 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No # _FillValue is never a valid encoding for Zarr # TODO: refactor this logic so we don't need to check this here if "_FillValue" in v.encoding: - efv = v.encoding["_FillValue"] - if efv is not None: + if v.encoding.get("_FillValue") is not None: raise ValueError("Zarr does not support _FillValue in encoding.") else: del v.encoding["_FillValue"] @@ -978,15 +972,13 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No # - Existing variables already have their attrs included in the consolidated metadata file. # - The size of dimensions can not be expanded, that would require a call using `append_dim` # which is mutually exclusive with `region` - kwargs = {} - if _zarr_v3(): - kwargs["store"] = self.zarr_group.store - else: - kwargs["store"] = self.zarr_group.chunk_store - - # TODO: see if zarr should normalize these strings. zarr_array = zarr.open( - **kwargs, + store=( + self.zarr_group.store + if _zarr_v3() + else self.zarr_group.chunk_store + ), + # TODO: see if zarr should normalize these strings. path="/".join([self.zarr_group.name.rstrip("/"), name]).lstrip( "/" ), @@ -1271,18 +1263,18 @@ def open_zarr( possible, otherwise defaulting to the default version used by the zarr-python library installed. use_zarr_fill_value_as_mask : bool, optional - If True, use the zarr Array `fill_value` to mask the data, the same as done - for NetCDF data with `_FillValue` or `missing_value` attributes. If False, - the `fill_value` is ignored and the data are not masked. If None, this defaults - to True for `zarr_version=2` and False for `zarr_version=3`. + If True, use the zarr Array ``fill_value`` to mask the data, the same as done + for NetCDF data with ``_FillValue`` or ``missing_value`` attributes. If False, + the ``fill_value`` is ignored and the data are not masked. If None, this defaults + to True for ``zarr_version=2`` and False for ``zarr_version=3``. chunked_array_type: str, optional Which chunked array type to coerce this datasets' arrays to. Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEntryPoint` system. Experimental API that should not be relied upon. from_array_kwargs: dict, optional - Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create - chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. - Defaults to {'manager': 'dask'}, meaning additional kwargs will be passed eventually to + Additional keyword arguments passed on to the ``ChunkManagerEntrypoint.from_array`` method used to create + chunked arrays, via whichever chunk manager is specified through the ``chunked_array_type`` kwarg. + Defaults to ``{'manager': 'dask'}``, meaning additional kwargs will be passed eventually to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. Returns @@ -1644,10 +1636,7 @@ def _get_open_params( close_store_on_close = zarr_group.store is not store # we use this to determine how to handle fill_value - if _zarr_v3(): - is_zarr_v3_format = zarr_group.metadata.zarr_format == 3 - else: - is_zarr_v3_format = False + is_zarr_v3_format = _zarr_v3() and zarr_group.metadata.zarr_format == 3 if use_zarr_fill_value_as_mask is None: if is_zarr_v3_format: # for new data, we use a better default @@ -1676,8 +1665,8 @@ def _handle_zarr_version_or_format( f"zarr_format {zarr_format} does not match zarr_version {zarr_version}, please only set one" ) if zarr_version is not None: - warnings.warn( - "zarr_version is deprecated, use zarr_format", FutureWarning, stacklevel=6 + emit_user_level_warning( + "zarr_version is deprecated, use zarr_format", FutureWarning ) return zarr_version return zarr_format diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index bd7ec6297b9..e22ca32ba36 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -117,6 +117,8 @@ def _importorskip( has_bottleneck, requires_bottleneck = _importorskip("bottleneck") has_rasterio, requires_rasterio = _importorskip("rasterio") has_zarr, requires_zarr = _importorskip("zarr") +# TODO: switch to "3" once Zarr V3 is released +has_zarr_v3, requires_zarr_v3 = _importorskip("zarr", "2.99") has_fsspec, requires_fsspec = _importorskip("fsspec") has_iris, requires_iris = _importorskip("iris") has_numbagg, requires_numbagg = _importorskip("numbagg", "0.4.0") diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 28f24e69d1c..52dc6bab726 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -67,6 +67,8 @@ has_netCDF4, has_numpy_2, has_scipy, + has_zarr, + has_zarr_v3, mock, network, requires_cftime, @@ -107,20 +109,11 @@ pass -try: +if has_zarr: import zarr import zarr.codecs -except ImportError: - have_zarr = False -else: - have_zarr = True - - -if have_zarr: - have_zarr_v3 = backends.zarr._zarr_v3() - have_zarr_kvstore = True - if have_zarr_v3: + if has_zarr_v3: from zarr.storage import MemoryStore as KVStore ZARR_FORMATS = [2, 3] @@ -133,16 +126,13 @@ except ImportError: KVStore = None # type: ignore[assignment,misc,unused-ignore] else: - have_zarr_v3 = False - have_zarr_kvstore = False - have_zarr_directory_store = False KVStore = None # type: ignore[assignment,misc,unused-ignore] ZARR_FORMATS = [] @pytest.fixture(scope="module", params=ZARR_FORMATS) def default_zarr_version(request) -> Generator[None, None]: - if have_zarr_v3: + if has_zarr_v3: with zarr.config.set(default_zarr_version=request.param): yield else: @@ -150,13 +140,13 @@ def default_zarr_version(request) -> Generator[None, None]: def skip_if_zarr_format_3(reason: str): - if have_zarr_v3 and zarr.config["default_zarr_version"] == 3: - pytest.xfail(reason=f"Unsupported with zarr_format=3: {reason}") + if has_zarr_v3 and zarr.config["default_zarr_version"] == 3: + pytest.skip(reason=f"Unsupported with zarr_format=3: {reason}") def skip_if_zarr_format_2(reason: str): - if not have_zarr_v3 or (zarr.config["default_zarr_version"] == 2): - pytest.xfail(reason=f"Unsupported with zarr_format=2: {reason}") + if not has_zarr_v3 or (zarr.config["default_zarr_version"] == 2): + pytest.skip(reason=f"Unsupported with zarr_format=2: {reason}") ON_WINDOWS = sys.platform == "win32" @@ -1345,11 +1335,10 @@ def test_explicitly_omit_fill_value(self) -> None: with self.roundtrip(ds) as actual: assert "_FillValue" not in actual.x.encoding - # TODO: decide if this test is really necessary - # _FillValue is not a valid encoding for Zarr def test_explicitly_omit_fill_value_via_encoding_kwarg(self) -> None: ds = Dataset({"x": ("y", [np.pi, -np.pi])}) kwargs = dict(encoding={"x": {"_FillValue": None}}) + # _FillValue is not a valid encoding for Zarr with self.roundtrip(ds, save_kwargs=kwargs) as actual: assert "_FillValue" not in actual.x.encoding assert ds.y.encoding == {} @@ -2336,7 +2325,7 @@ def test_non_existent_store(self) -> None: with pytest.raises(FileNotFoundError, match="No such file or directory"): xr.open_zarr(f"{uuid.uuid4()}") - @pytest.mark.skipif(have_zarr_v3, reason="chunk_store not implemented in zarr v3") + @pytest.mark.skipif(has_zarr_v3, reason="chunk_store not implemented in zarr v3") def test_with_chunkstore(self) -> None: expected = create_test_data() with ( @@ -2587,7 +2576,7 @@ def test_hidden_zarr_keys(self) -> None: for var in expected.variables.keys(): assert self.DIMENSION_KEY not in expected[var].attrs - if have_zarr_v3: + if has_zarr_v3: # temporary workaround for https://github.com/zarr-developers/zarr-python/issues/2338 zarr_group.store._is_open = True @@ -2652,7 +2641,7 @@ def test_compressor_encoding(self) -> None: original = create_test_data() # specify a custom compressor - if have_zarr_v3 and zarr.config.config["default_zarr_version"] == 3: + if has_zarr_v3 and zarr.config.config["default_zarr_version"] == 3: encoding_key = "codecs" # all parameters need to be explicitly specified in order for the comparison to pass below encoding = { @@ -2677,7 +2666,7 @@ def test_compressor_encoding(self) -> None: with self.roundtrip(original, save_kwargs=save_kwargs) as ds: enc = ds["var1"].encoding[encoding_key] - if have_zarr_v3 and zarr.config.config["default_zarr_version"] == 3: + if has_zarr_v3 and zarr.config.config["default_zarr_version"] == 3: # TODO: figure out a cleaner way to do this comparison codecs = zarr.core.metadata.v3.parse_codecs(enc) assert codecs == encoding[encoding_key] @@ -2820,7 +2809,7 @@ def test_check_encoding_is_consistent_after_append(self) -> None: import numcodecs encoding_value: Any - if have_zarr_v3 and zarr.config.config["default_zarr_version"] == 3: + if has_zarr_v3 and zarr.config.config["default_zarr_version"] == 3: compressor = zarr.codecs.BloscCodec() encoding_key = "codecs" encoding_value = [zarr.codecs.BytesCodec(), compressor] @@ -3250,8 +3239,7 @@ def test_chunked_cftime_datetime(self) -> None: KVStore is None, reason="zarr-python 2.x or ZARR_V3_EXPERIMENTAL_API is unset." ) class TestInstrumentedZarrStore: - - if have_zarr_v3: + if has_zarr_v3: methods = [ "get", "set", @@ -3270,13 +3258,10 @@ class TestInstrumentedZarrStore: @contextlib.contextmanager def create_zarr_target(self): - if not have_zarr: - pytest.skip("Instrumented tests only work on latest Zarr.") - if Version(zarr.__version__) < Version("2.18.0"): pytest.skip("Instrumented tests only work on latest Zarr.") - if have_zarr_v3: + if has_zarr_v3: kwargs = {"mode": "a"} else: kwargs = {} # type: ignore[arg-type,unused-ignore] @@ -3316,7 +3301,7 @@ def test_append(self) -> None: modified = Dataset({"foo": ("x", [2])}, coords={"x": [1]}) with self.create_zarr_target() as store: - if have_zarr_v3: + if has_zarr_v3: # TOOD: verify these expected = { "set": 17, @@ -3342,7 +3327,7 @@ def test_append(self) -> None: patches = self.make_patches(store) # v2024.03.0: {'iter': 6, 'contains': 2, 'setitem': 5, 'getitem': 10, 'listdir': 6, 'list_prefix': 0} # 6057128b: {'iter': 5, 'contains': 2, 'setitem': 5, 'getitem': 10, "listdir": 5, "list_prefix": 0} - if have_zarr_v3: + if has_zarr_v3: expected = { "set": 10, "get": 16, # TODO: fixme upstream (should be 8) @@ -3365,7 +3350,7 @@ def test_append(self) -> None: patches = self.make_patches(store) - if have_zarr_v3: + if has_zarr_v3: expected = { "set": 10, "get": 16, # TODO: fixme upstream (should be 8) @@ -3395,7 +3380,7 @@ def test_append(self) -> None: def test_region_write(self) -> None: ds = Dataset({"foo": ("x", [1, 2, 3])}, coords={"x": [1, 2, 3]}).chunk() with self.create_zarr_target() as store: - if have_zarr_v3: + if has_zarr_v3: expected = { "set": 5, "get": 10, @@ -3419,7 +3404,7 @@ def test_region_write(self) -> None: # v2024.03.0: {'iter': 5, 'contains': 2, 'setitem': 1, 'getitem': 6, 'listdir': 5, 'list_prefix': 0} # 6057128b: {'iter': 4, 'contains': 2, 'setitem': 1, 'getitem': 5, 'listdir': 4, 'list_prefix': 0} - if have_zarr_v3: + if has_zarr_v3: expected = { "set": 1, "get": 3, @@ -3443,7 +3428,7 @@ def test_region_write(self) -> None: # v2024.03.0: {'iter': 6, 'contains': 4, 'setitem': 1, 'getitem': 11, 'listdir': 6, 'list_prefix': 0} # 6057128b: {'iter': 4, 'contains': 2, 'setitem': 1, 'getitem': 7, 'listdir': 4, 'list_prefix': 0} - if have_zarr_v3: + if has_zarr_v3: expected = { "set": 1, "get": 5, @@ -3465,7 +3450,7 @@ def test_region_write(self) -> None: ds.to_zarr(store, region="auto") self.check_requests(expected, patches) - if have_zarr_v3: + if has_zarr_v3: expected = { "set": 0, "get": 5, @@ -3493,7 +3478,7 @@ def test_region_write(self) -> None: class TestZarrDictStore(ZarrBase): @contextlib.contextmanager def create_zarr_target(self): - if have_zarr_v3: + if has_zarr_v3: yield zarr.storage.MemoryStore({}, mode="a") else: yield {} @@ -3548,7 +3533,7 @@ def roundtrip_dir( @pytest.mark.parametrize("consolidated", [True, False, None]) @pytest.mark.parametrize("write_empty", [True, False, None]) @pytest.mark.skipif( - have_zarr_v3, reason="zarr-python 3.x removed write_empty_chunks" + has_zarr_v3, reason="zarr-python 3.x removed write_empty_chunks" ) def test_write_empty( self, consolidated: bool | None, write_empty: bool | None @@ -3619,7 +3604,7 @@ def test_avoid_excess_metadata_calls(self) -> None: # rather than a mocked method. Group: Any - if have_zarr_v3: + if has_zarr_v3: Group = zarr.AsyncGroup patched = patch.object( Group, "getitem", side_effect=Group.getitem, autospec=True @@ -3646,7 +3631,7 @@ def test_avoid_excess_metadata_calls(self) -> None: @requires_zarr @requires_fsspec -@pytest.mark.skipif(have_zarr_v3, reason="Difficult to test.") +@pytest.mark.skipif(has_zarr_v3, reason="Difficult to test.") def test_zarr_storage_options() -> None: pytest.importorskip("aiobotocore") ds = create_test_data() @@ -3660,7 +3645,7 @@ def test_zarr_storage_options() -> None: def test_zarr_version_deprecated() -> None: ds = create_test_data() store: Any - if have_zarr_v3: + if has_zarr_v3: store = KVStore() else: store = {} @@ -5412,9 +5397,8 @@ def test_dataarray_to_netcdf_no_name_pathlib(self) -> None: @requires_zarr class TestDataArrayToZarr: - def skip_if_zarr_python_3_and_zip_store(self, store) -> None: - if have_zarr_v3 and isinstance(store, zarr.storage.zip.ZipStore): + if has_zarr_v3 and isinstance(store, zarr.storage.zip.ZipStore): pytest.skip( reason="zarr-python 3.x doesn't support reopening ZipStore with a new mode." ) From ff0f2c0d481b66fd4a97937d6886260690620ae0 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 21 Oct 2024 15:01:10 -0600 Subject: [PATCH 72/74] update docstring --- xarray/core/dataarray.py | 8 ++++++++ xarray/core/dataset.py | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 8d460e492c6..0235ce17b7b 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -4342,6 +4342,14 @@ def to_zarr( * ``dask.delayed.Delayed`` if compute is False * ZarrStore otherwise + Notes + ----- + There exists a subtlety in interpreting zarr's ``fill_value`` property. For zarr v2 format + arrays, ``fill_value`` is *always* interpreted as an invalid value similar to the ``_FillValue`` attribute + in CF/netCDF. For Zarr v3 format arrays, only an explicit ``_FillValue`` attribute will be used + to mask the data if requested using ``mask_and_scale=True``. See this :ref:`Github issue `_ + for more. + References ---------- https://zarr.readthedocs.io/ diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 22e14e08a7c..b1e65b2252b 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2561,6 +2561,14 @@ def to_zarr( * ``dask.delayed.Delayed`` if compute is False * ZarrStore otherwise + Notes + ----- + There exists a subtlety in interpreting zarr's ``fill_value`` property. For zarr v2 format + arrays, ``fill_value`` is *always* interpreted as an invalid value similar to the ``_FillValue`` attribute + in CF/netCDF. For Zarr v3 format arrays, only an explicit ``_FillValue`` attribute will be used + to mask the data if requested using ``mask_and_scale=True``. See this :ref:`Github issue `_ + for more. + References ---------- https://zarr.readthedocs.io/ From 268e3eb1c54d37a894e14ff69ab5a45da5f80700 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 22 Oct 2024 07:13:00 -0600 Subject: [PATCH 73/74] fix rtd --- xarray/core/dataarray.py | 2 +- xarray/core/dataset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index a35fc5caf8b..d93e9745f9c 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -4348,7 +4348,7 @@ def to_zarr( There exists a subtlety in interpreting zarr's ``fill_value`` property. For zarr v2 format arrays, ``fill_value`` is *always* interpreted as an invalid value similar to the ``_FillValue`` attribute in CF/netCDF. For Zarr v3 format arrays, only an explicit ``_FillValue`` attribute will be used - to mask the data if requested using ``mask_and_scale=True``. See this :ref:`Github issue `_ + to mask the data if requested using ``mask_and_scale=True``. See this `Github issue `_ for more. References diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index fa3dca9d2e4..317f0e6597e 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2566,7 +2566,7 @@ def to_zarr( There exists a subtlety in interpreting zarr's ``fill_value`` property. For zarr v2 format arrays, ``fill_value`` is *always* interpreted as an invalid value similar to the ``_FillValue`` attribute in CF/netCDF. For Zarr v3 format arrays, only an explicit ``_FillValue`` attribute will be used - to mask the data if requested using ``mask_and_scale=True``. See this :ref:`Github issue `_ + to mask the data if requested using ``mask_and_scale=True``. See this `Github issue `_ for more. References From 7682bf4e2ab63612d1f32d238b68a6f241ca1bf1 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 22 Oct 2024 09:36:07 -0600 Subject: [PATCH 74/74] tweak --- xarray/core/dataarray.py | 15 +++++++-------- xarray/core/dataset.py | 15 +++++++-------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index d93e9745f9c..826acc7c7e9 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -4343,14 +4343,6 @@ def to_zarr( * ``dask.delayed.Delayed`` if compute is False * ZarrStore otherwise - Notes - ----- - There exists a subtlety in interpreting zarr's ``fill_value`` property. For zarr v2 format - arrays, ``fill_value`` is *always* interpreted as an invalid value similar to the ``_FillValue`` attribute - in CF/netCDF. For Zarr v3 format arrays, only an explicit ``_FillValue`` attribute will be used - to mask the data if requested using ``mask_and_scale=True``. See this `Github issue `_ - for more. - References ---------- https://zarr.readthedocs.io/ @@ -4368,6 +4360,13 @@ def to_zarr( The encoding attribute (if exists) of the DataArray(s) will be used. Override any existing encodings by providing the ``encoding`` kwarg. + ``fill_value`` handling: + There exists a subtlety in interpreting zarr's ``fill_value`` property. For zarr v2 format + arrays, ``fill_value`` is *always* interpreted as an invalid value similar to the ``_FillValue`` attribute + in CF/netCDF. For Zarr v3 format arrays, only an explicit ``_FillValue`` attribute will be used + to mask the data if requested using ``mask_and_scale=True``. See this `Github issue `_ + for more. + See Also -------- Dataset.to_zarr diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 317f0e6597e..408987eed53 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2561,14 +2561,6 @@ def to_zarr( * ``dask.delayed.Delayed`` if compute is False * ZarrStore otherwise - Notes - ----- - There exists a subtlety in interpreting zarr's ``fill_value`` property. For zarr v2 format - arrays, ``fill_value`` is *always* interpreted as an invalid value similar to the ``_FillValue`` attribute - in CF/netCDF. For Zarr v3 format arrays, only an explicit ``_FillValue`` attribute will be used - to mask the data if requested using ``mask_and_scale=True``. See this `Github issue `_ - for more. - References ---------- https://zarr.readthedocs.io/ @@ -2586,6 +2578,13 @@ def to_zarr( The encoding attribute (if exists) of the DataArray(s) will be used. Override any existing encodings by providing the ``encoding`` kwarg. + ``fill_value`` handling: + There exists a subtlety in interpreting zarr's ``fill_value`` property. For zarr v2 format + arrays, ``fill_value`` is *always* interpreted as an invalid value similar to the ``_FillValue`` attribute + in CF/netCDF. For Zarr v3 format arrays, only an explicit ``_FillValue`` attribute will be used + to mask the data if requested using ``mask_and_scale=True``. See this `Github issue `_ + for more. + See Also -------- :ref:`io.zarr`