Commit

Merge branch 'main' into in-mem-icechunk
TomNicholas authored Jan 29, 2025
2 parents 2f1eed1 + 1160f3a commit a57c8ce
Showing 6 changed files with 40 additions and 7 deletions.
12 changes: 10 additions & 2 deletions docs/usage.md
@@ -228,8 +228,16 @@ You can see that the dataset contains a mixture of virtual variables backed by `
Loading variables can be useful in a few scenarios:
1. You need to look at the actual values of a multi-dimensional variable in order to decide what to do next,
2. You want in-memory indexes to use with ``xr.combine_by_coords``,
3. Storing a variable on-disk as a set of references would be inefficient, e.g. because each chunk is very small (saving the values like this is similar to kerchunk's concept of "inlining" data),
4. The variable has complicated encoding, and the simplest way to decode it correctly is to let xarray's standard decoding machinery load it into memory and apply the decoding.
3. Storing a variable on disk as a set of references would be inefficient, e.g. because it's a very small array (saving the values like this is similar to kerchunk's concept of "inlining" data),
4. The variable has encoding, and the simplest way to decode it correctly is to let xarray's standard decoding machinery load it into memory and apply the decoding,
5. Some of your variables have inconsistent-length chunks, and you want to be able to concatenate them together. For example, you might have multiple virtual datasets with coordinates of inconsistent length (e.g., leap years within multi-year daily data).
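
To make the list above concrete, here is a minimal sketch of loading a handful of variables while keeping the rest virtual. The filename and variable names are hypothetical; the call assumes the top-level `open_virtual_dataset` entry point with the `loadable_variables` keyword seen in the readers changed below.

```python
from virtualizarr import open_virtual_dataset

# Hypothetical file and variable names: load the small coordinate variables
# into memory, while all other variables stay as virtual chunk references.
vds = open_virtual_dataset(
    "air_temperature_2009.nc",
    loadable_variables=["time", "lat", "lon"],
)

# The loaded variables now hold real values (useful for inspection or for
# building in-memory indexes); the data variables are still virtual.
print(vds)
```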

### Loading low-dimensional coordinates

In general, it is recommended to load all of your low-dimensional coordinates.
This will slow down the initial opening of each individual virtual dataset, but once the coordinates are loaded into memory they can be inlined in the reference file, giving fast reads of the virtualized store.
However, doing this for N-dimensional coordinates could use a lot of storage, since their values would be duplicated.
Also, anything duplicated could become out of sync with the original referenced files, especially if you are not using a transactional storage engine like `Icechunk`.
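
A hedged sketch of the recommendation above, assuming the same `open_virtual_dataset` entry point: inspect the file with plain xarray first to find the low-dimensional coordinates, then pass their names as `loadable_variables`. The filename is a placeholder.

```python
import xarray as xr
from virtualizarr import open_virtual_dataset

# Peek at the file lazily just to see which coordinates are 0- or 1-dimensional.
with xr.open_dataset("ocean_model_day_001.nc") as ds:
    low_dim_coords = [name for name, coord in ds.coords.items() if coord.ndim <= 1]

# Load only those coordinates into memory; everything else stays virtual,
# so the duplication cost in the reference file remains small.
vds = open_virtual_dataset("ocean_model_day_001.nc", loadable_variables=low_dim_coords)
```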

### CF-encoded time variables

2 changes: 1 addition & 1 deletion virtualizarr/readers/dmrpp.py
@@ -27,7 +27,7 @@ def open_virtual_dataset(
virtual_backend_kwargs: Optional[dict] = None,
reader_options: Optional[dict] = None,
) -> Dataset:
loadable_variables, drop_variables = check_for_collisions(
drop_variables, loadable_variables = check_for_collisions(
drop_variables=drop_variables,
loadable_variables=loadable_variables,
)
2 changes: 1 addition & 1 deletion virtualizarr/readers/kerchunk.py
@@ -38,7 +38,7 @@ def open_virtual_dataset(
if group:
raise NotImplementedError()

loadable_variables, drop_variables = check_for_collisions(
drop_variables, loadable_variables = check_for_collisions(
drop_variables=drop_variables,
loadable_variables=loadable_variables,
)
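
Both reader changes above apply the same one-line fix: the tuple returned by `check_for_collisions` appears to have been unpacked in the wrong order, so the drop list and the load list were swapped. A minimal sketch of the corrected unpacking, under the assumption that the helper lives in `virtualizarr.utils` and returns `(drop_variables, loadable_variables)`:

```python
from virtualizarr.utils import check_for_collisions  # assumed import path

# Assumed return order: (drop_variables, loadable_variables).
# Unpacking the other way round would silently swap the two lists.
drop_variables, loadable_variables = check_for_collisions(
    drop_variables=["mask"],
    loadable_variables=["time"],
)
```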
8 changes: 5 additions & 3 deletions virtualizarr/tests/test_backend.py
@@ -234,13 +234,15 @@ class TestReadFromURL:
"netcdf4",
"https://github.com/pydata/xarray-data/raw/master/ROMS_example.nc",
),
(
pytest.param(
"hdf4",
"https://github.com/corteva/rioxarray/raw/master/test/test_data/input/MOD09GA.A2008296.h14v17.006.2015181011753.hdf",
marks=pytest.mark.skip(reason="often times out"),
),
(
pytest.param(
"hdf5",
"https://nisar.asf.earthdatacloud.nasa.gov/NISAR-SAMPLE-DATA/GCOV/ALOS1_Rosamond_20081012/NISAR_L2_PR_GCOV_001_005_A_219_4020_SHNA_A_20081012T060910_20081012T060926_P01101_F_N_J_001.h5",
marks=pytest.mark.skip(reason="often times out"),
),
# https://github.com/zarr-developers/VirtualiZarr/issues/159
# ("hdf5", "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/NEONDSTowerTemperatureData.hdf5"),
@@ -284,7 +286,7 @@ def test_read_from_url(self, hdf_backend, filetype, url):
vds = open_virtual_dataset(url, indexes={})
assert isinstance(vds, xr.Dataset)

@pytest.mark.xfail(reason="often times out, as nisar file is 200MB")
@pytest.mark.skip(reason="often times out, as nisar file is 200MB")
def test_virtualizarr_vs_local_nisar(self, hdf_backend):
import fsspec

12 changes: 12 additions & 0 deletions virtualizarr/tests/test_readers/test_dmrpp.py
@@ -446,3 +446,15 @@ def test_relative_path_to_dmrpp_file(self, basic_dmrpp_temp_filepath: Path):
".dmrpp"
)
assert path == expected_datafile_path_uri


@pytest.mark.parametrize("drop_variables", ["mask", ["data", "mask"]])
def test_drop_variables(basic_dmrpp_temp_filepath: Path, drop_variables):
vds = open_virtual_dataset(
str(basic_dmrpp_temp_filepath),
indexes={},
filetype="dmrpp",
drop_variables=drop_variables,
)

assert all(var not in vds for var in drop_variables)
11 changes: 11 additions & 0 deletions virtualizarr/tests/test_readers/test_kerchunk.py
@@ -248,3 +248,14 @@ def test_notimplemented_read_inline_refs(tmp_path, netcdf4_inlined_ref):
open_virtual_dataset(
filepath=ref_filepath.as_posix(), filetype="kerchunk", indexes={}
)


@pytest.mark.parametrize("drop_variables", ["a", ["a"]])
def test_drop_variables(refs_file_factory, drop_variables):
refs_file = refs_file_factory()

vds = open_virtual_dataset(
refs_file, filetype="kerchunk", drop_variables=drop_variables
)

assert all(var not in vds for var in drop_variables)
