Concatenation of virtual datasets fails due to missing Chunk Manager #382
Comments
Hey there @observingClouds, thanks for surfacing all these bugs! It's super helpful. This one feels similar to #141. I tried adding |
Thanks for your quick response. Sorry that I missed the referenced issue. How do I now load this data? I would like to write this joined virtual dataset to disk as a reference file and load it with xarray, like:

vds = xr.merge([vds1, vds2], compat='override')
vds.virtualize.to_kerchunk("ref.json", format="json")
xr.open_zarr("reference://", storage_options={'fo': "ref.json"})

This however leads to:

Full traceback:
----> 1 xr.open_zarr("reference://", storage_options={'fo':"ref.json"})
File ~/virtualizarr/lib/python3.13/site-packages/xarray/backends/zarr.py:1491, in open_zarr(store, group, synchronizer, chunks, decode_cf, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, consolidated, overwrite_encoded_chunks, chunk_store, storage_options, decode_timedelta, use_cftime, zarr_version, zarr_format, use_zarr_fill_value_as_mask, chunked_array_type, from_array_kwargs, **kwargs)
1477 raise TypeError(
1478 "open_zarr() got unexpected keyword arguments " + ",".join(kwargs.keys())
1479 )
1481 backend_kwargs = {
1482 "synchronizer": synchronizer,
1483 "consolidated": consolidated,
(...)
1488 "zarr_format": zarr_format,
1489 }
-> 1491 ds = open_dataset(
1492 filename_or_obj=store,
1493 group=group,
1494 decode_cf=decode_cf,
1495 mask_and_scale=mask_and_scale,
1496 decode_times=decode_times,
1497 concat_characters=concat_characters,
1498 decode_coords=decode_coords,
1499 engine="zarr",
1500 chunks=chunks,
1501 drop_variables=drop_variables,
1502 chunked_array_type=chunked_array_type,
1503 from_array_kwargs=from_array_kwargs,
1504 backend_kwargs=backend_kwargs,
1505 decode_timedelta=decode_timedelta,
1506 use_cftime=use_cftime,
1507 zarr_version=zarr_version,
1508 use_zarr_fill_value_as_mask=use_zarr_fill_value_as_mask,
1509 )
1510 return ds
File ~/virtualizarr/lib/python3.13/site-packages/xarray/backends/api.py:679, in open_dataset(filename_or_obj, engine, chunks, cache, decode_cf, mask_and_scale, decode_times, decode_timedelta, use_cftime, concat_characters, decode_coords, drop_variables, inline_array, chunked_array_type, from_array_kwargs, backend_kwargs, **kwargs)
667 decoders = _resolve_decoders_kwargs(
668 decode_cf,
669 open_backend_dataset_parameters=backend.open_dataset_parameters,
(...)
675 decode_coords=decode_coords,
676 )
678 overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None)
--> 679 backend_ds = backend.open_dataset(
680 filename_or_obj,
681 drop_variables=drop_variables,
682 **decoders,
683 **kwargs,
684 )
685 ds = _dataset_from_backend_dataset(
686 backend_ds,
687 filename_or_obj,
(...)
697 **kwargs,
698 )
699 return ds
File ~/virtualizarr/lib/python3.13/site-packages/xarray/backends/zarr.py:1581, in ZarrBackendEntrypoint.open_dataset(self, filename_or_obj, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta, group, mode, synchronizer, consolidated, chunk_store, storage_options, zarr_version, zarr_format, store, engine, use_zarr_fill_value_as_mask, cache_members)
1579 store_entrypoint = StoreBackendEntrypoint()
1580 with close_on_error(store):
-> 1581 ds = store_entrypoint.open_dataset(
1582 store,
1583 mask_and_scale=mask_and_scale,
1584 decode_times=decode_times,
1585 concat_characters=concat_characters,
1586 decode_coords=decode_coords,
1587 drop_variables=drop_variables,
1588 use_cftime=use_cftime,
1589 decode_timedelta=decode_timedelta,
1590 )
1591 return ds
File ~/virtualizarr/lib/python3.13/site-packages/xarray/backends/store.py:59, in StoreBackendEntrypoint.open_dataset(self, filename_or_obj, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta)
45 encoding = filename_or_obj.get_encoding()
47 vars, attrs, coord_names = conventions.decode_cf_variables(
48 vars,
49 attrs,
(...)
56 decode_timedelta=decode_timedelta,
57 )
---> 59 ds = Dataset(vars, attrs=attrs)
60 ds = ds.set_coords(coord_names.intersection(vars))
61 ds.set_close(filename_or_obj.close)
File ~/virtualizarr/lib/python3.13/site-packages/xarray/core/dataset.py:747, in Dataset.__init__(self, data_vars, coords, attrs)
744 if isinstance(coords, Dataset):
745 coords = coords._variables
--> 747 variables, coord_names, dims, indexes, _ = merge_data_and_coords(
748 data_vars, coords
749 )
751 self._attrs = dict(attrs) if attrs else None
752 self._close = None
File ~/virtualizarr/lib/python3.13/site-packages/xarray/core/dataset.py:460, in merge_data_and_coords(data_vars, coords)
456 coords = create_coords_with_default_indexes(coords, data_vars)
458 # exclude coords from alignment (all variables in a Coordinates object should
459 # already be aligned together) and use coordinates' indexes to align data_vars
--> 460 return merge_core(
461 [data_vars, coords],
462 compat="broadcast_equals",
463 join="outer",
464 explicit_coords=tuple(coords),
465 indexes=coords.xindexes,
466 priority_arg=1,
467 skip_align_args=[1],
468 )
File ~/virtualizarr/lib/python3.13/site-packages/xarray/core/merge.py:699, in merge_core(objects, compat, join, combine_attrs, priority_arg, explicit_coords, indexes, fill_value, skip_align_args)
696 for pos, obj in skip_align_objs:
697 aligned.insert(pos, obj)
--> 699 collected = collect_variables_and_indexes(aligned, indexes=indexes)
700 prioritized = _get_priority_vars_and_indexes(aligned, priority_arg, compat=compat)
701 variables, out_indexes = merge_collected(
702 collected, prioritized, compat=compat, combine_attrs=combine_attrs
703 )
File ~/virtualizarr/lib/python3.13/site-packages/xarray/core/merge.py:362, in collect_variables_and_indexes(list_of_mappings, indexes)
360 append(name, variable, indexes[name])
361 elif variable.dims == (name,):
--> 362 idx, idx_vars = create_default_index_implicit(variable)
363 append_all(idx_vars, {k: idx for k in idx_vars})
364 else:
File ~/virtualizarr/lib/python3.13/site-packages/xarray/core/indexes.py:1425, in create_default_index_implicit(dim_variable, all_variables)
1423 else:
1424 dim_var = {name: dim_variable}
-> 1425 index = PandasIndex.from_variables(dim_var, options={})
1426 index_vars = index.create_variables(dim_var)
1428 return index, index_vars
File ~/virtualizarr/lib/python3.13/site-packages/xarray/core/indexes.py:654, in PandasIndex.from_variables(cls, variables, options)
651 if level is not None:
652 data = var._data.array.get_level_values(level)
--> 654 obj = cls(data, dim, coord_dtype=var.dtype)
655 assert not isinstance(obj.index, pd.MultiIndex)
656 # Rename safely
657 # make a shallow copy: cheap and because the index name may be updated
658 # here or in other constructors (cannot use pd.Index.rename as this
659 # constructor is also called from PandasMultiIndex)
File ~/virtualizarr/lib/python3.13/site-packages/xarray/core/indexes.py:589, in PandasIndex.__init__(self, array, dim, coord_dtype, fastpath)
587 index = array
588 else:
--> 589 index = safe_cast_to_index(array)
591 if index.name is None:
592 # make a shallow copy: cheap and because the index name may be updated
593 # here or in other constructors (cannot use pd.Index.rename as this
594 # constructor is also called from PandasMultiIndex)
595 index = index.copy()
File ~/virtualizarr/lib/python3.13/site-packages/xarray/core/indexes.py:469, in safe_cast_to_index(array)
459 emit_user_level_warning(
460 (
461 "`pandas.Index` does not support the `float16` dtype."
(...)
465 category=DeprecationWarning,
466 )
467 kwargs["dtype"] = "float64"
--> 469 index = pd.Index(np.asarray(array), **kwargs)
471 return _maybe_cast_to_cftimeindex(index)
File ~/virtualizarr/lib/python3.13/site-packages/xarray/core/indexing.py:509, in ExplicitlyIndexed.__array__(self, dtype, copy)
504 def __array__(
505 self, dtype: np.typing.DTypeLike = None, /, *, copy: bool | None = None
506 ) -> np.ndarray:
507 # Leave casting to an array up to the underlying array type.
508 if Version(np.__version__) >= Version("2.0.0"):
--> 509 return np.asarray(self.get_duck_array(), dtype=dtype, copy=copy)
510 else:
511 return np.asarray(self.get_duck_array(), dtype=dtype)
File ~/virtualizarr/lib/python3.13/site-packages/xarray/coding/variables.py:81, in _ElementwiseFunctionArray.get_duck_array(self)
80 def get_duck_array(self):
---> 81 return self.func(self.array.get_duck_array())
File ~/virtualizarr/lib/python3.13/site-packages/xarray/core/indexing.py:652, in LazilyIndexedArray.get_duck_array(self)
648 array = apply_indexer(self.array, self.key)
649 else:
650 # If the array is not an ExplicitlyIndexedNDArrayMixin,
651 # it may wrap a BackendArray so use its __getitem__
--> 652 array = self.array[self.key]
654 # self.array[self.key] is now a numpy array when
655 # self.array is a BackendArray subclass
656 # and self.key is BasicIndexer((slice(None, None, None),))
657 # so we need the explicit check for ExplicitlyIndexed
658 if isinstance(array, ExplicitlyIndexed):
File ~/virtualizarr/lib/python3.13/site-packages/xarray/backends/zarr.py:227, in ZarrArrayWrapper.__getitem__(self, key)
225 elif isinstance(key, indexing.OuterIndexer):
226 method = self._oindex
--> 227 return indexing.explicit_indexing_adapter(
228 key, array.shape, indexing.IndexingSupport.VECTORIZED, method
229 )
File ~/virtualizarr/lib/python3.13/site-packages/xarray/core/indexing.py:1013, in explicit_indexing_adapter(key, shape, indexing_support, raw_indexing_method)
991 """Support explicit indexing by delegating to a raw indexing method.
992
993 Outer and/or vectorized indexers are supported by indexing a second time
(...)
1010 Indexing result, in the form of a duck numpy-array.
1011 """
1012 raw_key, numpy_indices = decompose_indexer(key, shape, indexing_support)
-> 1013 result = raw_indexing_method(raw_key.tuple)
1014 if numpy_indices.tuple:
1015 # index the loaded np.ndarray
1016 indexable = NumpyIndexingAdapter(result)
File ~/virtualizarr/lib/python3.13/site-packages/xarray/backends/zarr.py:217, in ZarrArrayWrapper._getitem(self, key)
216 def _getitem(self, key):
--> 217 return self._array[key]
File ~/virtualizarr/lib/python3.13/site-packages/zarr/core/array.py:2365, in Array.__getitem__(self, selection)
2363 return self.vindex[cast(CoordinateSelection | MaskSelection, selection)]
2364 elif is_pure_orthogonal_indexing(pure_selection, self.ndim):
-> 2365 return self.get_orthogonal_selection(pure_selection, fields=fields)
2366 else:
2367 return self.get_basic_selection(cast(BasicSelection, pure_selection), fields=fields)
File ~/virtualizarr/lib/python3.13/site-packages/zarr/_compat.py:43, in _deprecate_positional_args.<locals>._inner_deprecate_positional_args.<locals>.inner_f(*args, **kwargs)
41 extra_args = len(args) - len(all_args)
42 if extra_args <= 0:
---> 43 return f(*args, **kwargs)
45 # extra_args > 0
46 args_msg = [
47 f"{name}={arg}"
48 for name, arg in zip(kwonly_args[:extra_args], args[-extra_args:], strict=False)
49 ]
File ~/virtualizarr/lib/python3.13/site-packages/zarr/core/array.py:2807, in Array.get_orthogonal_selection(self, selection, out, fields, prototype)
2805 prototype = default_buffer_prototype()
2806 indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid)
-> 2807 return sync(
2808 self._async_array._get_selection(
2809 indexer=indexer, out=out, fields=fields, prototype=prototype
2810 )
2811 )
File ~/virtualizarr/lib/python3.13/site-packages/zarr/core/sync.py:142, in sync(coro, loop, timeout)
139 return_result = next(iter(finished)).result()
141 if isinstance(return_result, BaseException):
--> 142 raise return_result
143 else:
144 return return_result
File ~/virtualizarr/lib/python3.13/site-packages/zarr/core/sync.py:98, in _runner(coro)
93 """
94 Await a coroutine and return the result of running it. If awaiting the coroutine raises an
95 exception, the exception will be returned.
96 """
97 try:
---> 98 return await coro
99 except Exception as ex:
100 return ex
File ~/virtualizarr/lib/python3.13/site-packages/zarr/core/array.py:1230, in AsyncArray._get_selection(self, indexer, prototype, out, fields)
1222 out_buffer = prototype.nd_buffer.create(
1223 shape=indexer.shape,
1224 dtype=out_dtype,
1225 order=self._config.order,
1226 fill_value=self.metadata.fill_value,
1227 )
1228 if product(indexer.shape) > 0:
1229 # reading chunks and decoding them
-> 1230 await self.codec_pipeline.read(
1231 [
1232 (
1233 self.store_path / self.metadata.encode_chunk_key(chunk_coords),
1234 self.metadata.get_chunk_spec(
1235 chunk_coords, self._config, prototype=prototype
1236 ),
1237 chunk_selection,
1238 out_selection,
1239 )
1240 for chunk_coords, chunk_selection, out_selection in indexer
1241 ],
1242 out_buffer,
1243 drop_axes=indexer.drop_axes,
1244 )
1245 return out_buffer.as_ndarray_like()
File ~/virtualizarr/lib/python3.13/site-packages/zarr/core/codec_pipeline.py:453, in BatchedCodecPipeline.read(self, batch_info, out, drop_axes)
447 async def read(
448 self,
449 batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]],
450 out: NDBuffer,
451 drop_axes: tuple[int, ...] = (),
452 ) -> None:
--> 453 await concurrent_map(
454 [
455 (single_batch_info, out, drop_axes)
456 for single_batch_info in batched(batch_info, self.batch_size)
457 ],
458 self.read_batch,
459 config.get("async.concurrency"),
460 )
File ~/virtualizarr/lib/python3.13/site-packages/zarr/core/common.py:68, in concurrent_map(items, func, limit)
65 async with sem:
66 return await func(*item)
---> 68 return await asyncio.gather(*[asyncio.ensure_future(run(item)) for item in items])
File ~/virtualizarr/lib/python3.13/site-packages/zarr/core/common.py:66, in concurrent_map.<locals>.run(item)
64 async def run(item: tuple[Any]) -> V:
65 async with sem:
---> 66 return await func(*item)
File ~/virtualizarr/lib/python3.13/site-packages/zarr/core/codec_pipeline.py:270, in BatchedCodecPipeline.read_batch(self, batch_info, out, drop_axes)
261 else:
262 chunk_bytes_batch = await concurrent_map(
263 [
264 (byte_getter, array_spec.prototype)
(...)
268 config.get("async.concurrency"),
269 )
--> 270 chunk_array_batch = await self.decode_batch(
271 [
272 (chunk_bytes, chunk_spec)
273 for chunk_bytes, (_, chunk_spec, _, _) in zip(
274 chunk_bytes_batch, batch_info, strict=False
275 )
276 ],
277 )
278 for chunk_array, (_, chunk_spec, chunk_selection, out_selection) in zip(
279 chunk_array_batch, batch_info, strict=False
280 ):
281 if chunk_array is not None:
File ~/virtualizarr/lib/python3.13/site-packages/zarr/core/codec_pipeline.py:177, in BatchedCodecPipeline.decode_batch(self, chunk_bytes_and_specs)
172 chunk_bytes_batch = await bb_codec.decode(
173 zip(chunk_bytes_batch, chunk_spec_batch, strict=False)
174 )
176 ab_codec, chunk_spec_batch = ab_codec_with_spec
--> 177 chunk_array_batch = await ab_codec.decode(
178 zip(chunk_bytes_batch, chunk_spec_batch, strict=False)
179 )
181 for aa_codec, chunk_spec_batch in aa_codecs_with_spec[::-1]:
182 chunk_array_batch = await aa_codec.decode(
183 zip(chunk_array_batch, chunk_spec_batch, strict=False)
184 )
File ~/virtualizarr/lib/python3.13/site-packages/zarr/abc/codec.py:129, in BaseCodec.decode(self, chunks_and_specs)
113 async def decode(
114 self,
115 chunks_and_specs: Iterable[tuple[CodecOutput | None, ArraySpec]],
116 ) -> Iterable[CodecInput | None]:
117 """Decodes a batch of chunks.
118 Chunks can be None in which case they are ignored by the codec.
119
(...)
127 Iterable[CodecInput | None]
128 """
--> 129 return await _batching_helper(self._decode_single, chunks_and_specs)
File ~/virtualizarr/lib/python3.13/site-packages/zarr/abc/codec.py:407, in _batching_helper(func, batch_info)
403 async def _batching_helper(
404 func: Callable[[CodecInput, ArraySpec], Awaitable[CodecOutput | None]],
405 batch_info: Iterable[tuple[CodecInput | None, ArraySpec]],
406 ) -> list[CodecOutput | None]:
--> 407 return await concurrent_map(
408 list(batch_info),
409 _noop_for_none(func),
410 config.get("async.concurrency"),
411 )
File ~/virtualizarr/lib/python3.13/site-packages/zarr/core/common.py:68, in concurrent_map(items, func, limit)
65 async with sem:
66 return await func(*item)
---> 68 return await asyncio.gather(*[asyncio.ensure_future(run(item)) for item in items])
File ~/virtualizarr/lib/python3.13/site-packages/zarr/core/common.py:66, in concurrent_map.<locals>.run(item)
64 async def run(item: tuple[Any]) -> V:
65 async with sem:
---> 66 return await func(*item)
File ~/virtualizarr/lib/python3.13/site-packages/zarr/abc/codec.py:420, in _noop_for_none.<locals>.wrap(chunk, chunk_spec)
418 if chunk is None:
419 return None
--> 420 return await func(chunk, chunk_spec)
File ~/virtualizarr/lib/python3.13/site-packages/zarr/codecs/_v2.py:51, in V2Codec._decode_single(self, chunk_bytes, chunk_spec)
49 if chunk_spec.dtype != object:
50 try:
---> 51 chunk = chunk.view(chunk_spec.dtype)
52 except TypeError:
53 # this will happen if the dtype of the chunk
54 # does not match the dtype of the array spec i.g. if
55 # the dtype of the chunk_spec is a string dtype, but the chunk
56 # is an object array. In this case, we need to convert the object
57 # array to the correct dtype.
59 chunk = np.array(chunk).astype(chunk_spec.dtype)
ValueError: When changing to a larger dtype, its size must be a divisor of the total size in bytes of the last axis of the array.

It seems like the types (float64 vs int64) are already off after merging (and the dimension sizes differ as well):
|
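For readers following along: vds1 and vds2 come from kerchunk references of two small test datasets. A minimal sketch of what those datasets might look like, with values and dtypes inferred from the xr.combine_by_coords output shown later in the thread (the actual construction in #381 may differ):

import numpy as np
import xarray as xr

# Inferred from the combined repr further down: ds1 holds a and b on
# x = [10, 20]; ds2 holds c and d on x = [1, 2]; both share y = [1, 2].
ds1 = xr.Dataset(
    {
        "a": (("x", "y"), np.array([[1, 2], [3, 4]])),
        "b": (("x", "y"), np.array([[10, 20], [30, 40]])),
    },
    coords={"x": [10, 20], "y": [1, 2]},
)
ds2 = xr.Dataset(
    {
        "c": (("x", "y"), np.array([[5, 6], [7, 8]])),
        "d": (("x", "y"), np.array([[50, 60], [70, 80]])),
    },
    coords={"x": [1, 2], "y": [1, 2]},
)
# ref1.json and ref2.json are kerchunk reference files for these two
# datasets, created as described in #381.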
No problem, and no worries! There is tons of churn in issues. Hmm, I don't recall seeing that error before. It's not a real solution, but adding loadable_variables:

import json

import xarray as xr
from virtualizarr import open_virtual_dataset
## Write references to disk (open_virtual_dataset expects a string)
with open("ref1.json", "w") as f:
json.dump(ref1, f)
with open("ref2.json", "w") as f:
json.dump(ref2, f)
# Note this section requires the modification in #381
vds1 = open_virtual_dataset("ref1.json", filetype='kerchunk', loadable_variables=['x','y'])
vds2 = open_virtual_dataset("ref2.json", filetype='kerchunk', loadable_variables=['x','y'])
combined_vds = xr.merge([vds1, vds2],compat='override')
combined_vds.virtualize.to_kerchunk('combined.parquet', format='parquet')
vds = xr.open_dataset("combined.parquet", engine='kerchunk')
Stepping back a bit: while it would be good to figure out what is going wrong here, would writing to Icechunk instead of one of the legacy Kerchunk formats make sense for your use case? |
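For anyone who wants to try the Icechunk route: a rough sketch, assuming icechunk >= 0.1 and a VirtualiZarr release that ships the virtualize.to_icechunk accessor, and reusing combined_vds from the snippet above. The store/session API has shifted between icechunk releases, so treat the exact calls as assumptions rather than a recipe.

import icechunk
import xarray as xr

# Assumed API: create a local Icechunk repository and open a writable session.
storage = icechunk.local_filesystem_storage("./combined_icechunk")
repo = icechunk.Repository.create(storage)
session = repo.writable_session("main")

# Write the combined virtual dataset's chunk references into the store and
# commit the snapshot.
combined_vds.virtualize.to_icechunk(session.store)
session.commit("add combined virtual references")

# Read it back like any other Zarr store.
ds = xr.open_zarr(repo.readonly_session("main").store, consolidated=False)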
Thanks @norlandrhagen for troubleshooting. I will need to read up on Icechunk. In the meantime, please note that the dimensions still differ from those of the "real" datasets: Dimensions: (x: 2, y: 2) vs. Dimensions: (x: 4, y: 2) |
I'll try to look at this properly soon and run your example, but I suspect you might need |
yes this is pydata/xarray#8778 yet again |
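For concreteness, the keyword combination being pointed at is roughly the following sketch. Note that xr.merge has no coords argument (the TypeError further down confirms this), and that join='override' keeps only the first dataset's x values, so it sidesteps the error rather than producing the outer-joined (x: 4) result:

import xarray as xr

# join='override' skips the alignment/reindexing step that ManifestArray-backed
# variables cannot perform; compat='override' skips equality checks between
# the virtual variables.
combined_vds = xr.merge([vds1, vds2], compat="override", join="override")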
Okay, I am not getting the correct results yet. I tried now with:

>>> xr.combine_by_coords([vds1, vds2])
ValueError: Every dimension requires a corresponding 1D coordinate and index for inferring concatenation order but the coordinate 'x' has no corresponding index

Full traceback:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[21], line 1
----> 1 xr.combine_by_coords([vds1,vds2])
File /etc/ecmwf/nfs/dh2_home_b/dnk9255/envs/virtual/lib/python3.13/site-packages/xarray/core/combine.py:973, in combine_by_coords(data_objects, compat, data_vars, coords, fill_value, join, combine_attrs)
969 grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys)
971 # Perform the multidimensional combine on each group of data variables
972 # before merging back together
--> 973 concatenated_grouped_by_data_vars = tuple(
974 _combine_single_variable_hypercube(
975 tuple(datasets_with_same_vars),
976 fill_value=fill_value,
977 data_vars=data_vars,
978 coords=coords,
979 compat=compat,
980 join=join,
981 combine_attrs=combine_attrs,
982 )
983 for vars, datasets_with_same_vars in grouped_by_vars
984 )
986 return merge(
987 concatenated_grouped_by_data_vars,
988 compat=compat,
(...)
991 combine_attrs=combine_attrs,
992 )
File /etc/ecmwf/nfs/dh2_home_b/dnk9255/envs/virtual/lib/python3.13/site-packages/xarray/core/combine.py:974, in <genexpr>(.0)
969 grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys)
971 # Perform the multidimensional combine on each group of data variables
972 # before merging back together
973 concatenated_grouped_by_data_vars = tuple(
--> 974 _combine_single_variable_hypercube(
975 tuple(datasets_with_same_vars),
976 fill_value=fill_value,
977 data_vars=data_vars,
978 coords=coords,
979 compat=compat,
980 join=join,
981 combine_attrs=combine_attrs,
982 )
983 for vars, datasets_with_same_vars in grouped_by_vars
984 )
986 return merge(
987 concatenated_grouped_by_data_vars,
988 compat=compat,
(...)
991 combine_attrs=combine_attrs,
992 )
File /etc/ecmwf/nfs/dh2_home_b/dnk9255/envs/virtual/lib/python3.13/site-packages/xarray/core/combine.py:634, in _combine_single_variable_hypercube(datasets, fill_value, data_vars, coords, compat, join, combine_attrs)
628 if len(datasets) == 0:
629 raise ValueError(
630 "At least one Dataset is required to resolve variable names "
631 "for combined hypercube."
632 )
--> 634 combined_ids, concat_dims = _infer_concat_order_from_coords(list(datasets))
636 if fill_value is None:
637 # check that datasets form complete hypercube
638 _check_shape_tile_ids(combined_ids)
File /etc/ecmwf/nfs/dh2_home_b/dnk9255/envs/virtual/lib/python3.13/site-packages/xarray/core/combine.py:109, in _infer_concat_order_from_coords(datasets)
103 if any(index is None for index in indexes):
104 error_msg = (
105 f"Every dimension requires a corresponding 1D coordinate "
106 f"and index for inferring concatenation order but the "
107 f"coordinate '{dim}' has no corresponding index"
108 )
--> 109 raise ValueError(error_msg)
111 # TODO (benbovy, flexible indexes): support flexible indexes?
112 indexes = [index.to_pandas_index() for index in indexes]
ValueError: Every dimension requires a corresponding 1D coordinate and index for inferring concatenation order but the coordinate 'x' has no corresponding index
In [22]: combined_vds = xr.merge([vds1, vds2],compat='override', join='override', coords='minimal')
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[22], line 1
----> 1 combined_vds = xr.merge([vds1, vds2],compat='override', join='override', coords='minimal')
TypeError: merge() got an unexpected keyword argument 'coords'
In [23]: xr.combine_by_coords([vds1,vds2])
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[23], line 1
----> 1 xr.combine_by_coords([vds1,vds2])
File /etc/ecmwf/nfs/dh2_home_b/dnk9255/envs/virtual/lib/python3.13/site-packages/xarray/core/combine.py:973, in combine_by_coords(data_objects, compat, data_vars, coords, fill_value, join, combine_attrs)
969 grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys)
971 # Perform the multidimensional combine on each group of data variables
972 # before merging back together
--> 973 concatenated_grouped_by_data_vars = tuple(
974 _combine_single_variable_hypercube(
975 tuple(datasets_with_same_vars),
976 fill_value=fill_value,
977 data_vars=data_vars,
978 coords=coords,
979 compat=compat,
980 join=join,
981 combine_attrs=combine_attrs,
982 )
983 for vars, datasets_with_same_vars in grouped_by_vars
984 )
986 return merge(
987 concatenated_grouped_by_data_vars,
988 compat=compat,
(...)
991 combine_attrs=combine_attrs,
992 )
File /etc/ecmwf/nfs/dh2_home_b/dnk9255/envs/virtual/lib/python3.13/site-packages/xarray/core/combine.py:974, in <genexpr>(.0)
969 grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys)
971 # Perform the multidimensional combine on each group of data variables
972 # before merging back together
973 concatenated_grouped_by_data_vars = tuple(
--> 974 _combine_single_variable_hypercube(
975 tuple(datasets_with_same_vars),
976 fill_value=fill_value,
977 data_vars=data_vars,
978 coords=coords,
979 compat=compat,
980 join=join,
981 combine_attrs=combine_attrs,
982 )
983 for vars, datasets_with_same_vars in grouped_by_vars
984 )
986 return merge(
987 concatenated_grouped_by_data_vars,
988 compat=compat,
(...)
991 combine_attrs=combine_attrs,
992 )
File /etc/ecmwf/nfs/dh2_home_b/dnk9255/envs/virtual/lib/python3.13/site-packages/xarray/core/combine.py:634, in _combine_single_variable_hypercube(datasets, fill_value, data_vars, coords, compat, join, combine_attrs)
628 if len(datasets) == 0:
629 raise ValueError(
630 "At least one Dataset is required to resolve variable names "
631 "for combined hypercube."
632 )
--> 634 combined_ids, concat_dims = _infer_concat_order_from_coords(list(datasets))
636 if fill_value is None:
637 # check that datasets form complete hypercube
638 _check_shape_tile_ids(combined_ids)
File /etc/ecmwf/nfs/dh2_home_b/dnk9255/envs/virtual/lib/python3.13/site-packages/xarray/core/combine.py:109, in _infer_concat_order_from_coords(datasets)
103 if any(index is None for index in indexes):
104 error_msg = (
105 f"Every dimension requires a corresponding 1D coordinate "
106 f"and index for inferring concatenation order but the "
107 f"coordinate '{dim}' has no corresponding index"
108 )
--> 109 raise ValueError(error_msg)
111 # TODO (benbovy, flexible indexes): support flexible indexes?
112 indexes = [index.to_pandas_index() for index in indexes]
ValueError: Every dimension requires a corresponding 1D coordinate and index for inferring concatenation order but the coordinate 'x' has no corresponding index

However, for the "real" datasets it works as expected:

>>> xr.combine_by_coords([ds1, ds2])
<xarray.Dataset> Size: 304B
Dimensions: (x: 4, y: 2)
Coordinates:
* x (x) int64 32B 1 2 10 20
* y (y) int64 16B 1 2
Data variables:
a (x, y) float64 64B nan nan nan nan 1.0 2.0 3.0 4.0
b (x, y) float64 64B nan nan nan nan 10.0 20.0 30.0 40.0
c (x, y) float64 64B 5.0 6.0 7.0 8.0 nan nan nan nan
d (x, y) float64 64B 50.0 60.0 70.0 80.0 nan nan nan nan
Attributes:
coordinates: x y |
You need to pass all the same keyword arguments to |
Why should I do

>>> xr.combine_by_coords([ds1, ds2], join='override', coords='minimal')
<xarray.Dataset> Size: 160B
Dimensions: (x: 2, y: 2)
Coordinates:
* x (x) int64 16B 10 20
* y (y) int64 16B 1 2
Data variables:
a (x, y) int64 32B 1 2 3 4
b (x, y) int64 32B 10 20 30 40
c (x, y) int64 32B 5 6 7 8
d (x, y) int64 32B 50 60 70 80
Attributes:
coordinates: x y

vs

>>> xr.combine_by_coords([ds1, ds2], join='outer')
<xarray.Dataset> Size: 304B
Dimensions: (x: 4, y: 2)
Coordinates:
* x (x) int64 32B 1 2 10 20
* y (y) int64 16B 1 2
Data variables:
a (x, y) float64 64B nan nan nan nan 1.0 2.0 3.0 4.0
b (x, y) float64 64B nan nan nan nan 10.0 20.0 30.0 40.0
c (x, y) float64 64B 5.0 6.0 7.0 8.0 nan nan nan nan
d (x, y) float64 64B 50.0 60.0 70.0 80.0 nan nan nan nan
Attributes:
coordinates: x y |
This was fixed by #396 |
Thanks for referencing @maxrjones! Providing the loadable variables with

vds1 = open_virtual_dataset("ref1.json", filetype='kerchunk', loadable_variables=['x','y'])

results in a NotImplementedError now (version:

Traceback:
File ~/Documents/GitProjects/venv/virtuali/lib/python3.13/site-packages/virtualizarr/backend.py:203, in open_virtual_dataset(filepath, filetype, group, drop_variables, loadable_variables, decode_times, cftime_variables, indexes, virtual_array_class, virtual_backend_kwargs, reader_options, backend)
200 if backend_cls is None:
201 raise NotImplementedError(f"Unsupported file type: {filetype.name}")
--> 203 vds = backend_cls.open_virtual_dataset(
204 filepath,
205 group=group,
206 drop_variables=drop_variables,
207 loadable_variables=loadable_variables,
208 decode_times=decode_times,
209 indexes=indexes,
210 virtual_backend_kwargs=virtual_backend_kwargs,
211 reader_options=reader_options,
212 )
214 return vds
File ~/Documents/GitProjects/venv/virtuali/lib/python3.13/site-packages/virtualizarr/readers/kerchunk.py:47, in KerchunkVirtualBackend.open_virtual_dataset(filepath, group, drop_variables, loadable_variables, decode_times, indexes, virtual_backend_kwargs, reader_options)
41 drop_variables, loadable_variables = check_for_collisions(
42 drop_variables=drop_variables,
43 loadable_variables=loadable_variables,
44 )
46 if loadable_variables or indexes or decode_times:
---> 47 raise NotImplementedError()
49 fs = _FsspecFSFromFilepath(filepath=filepath, reader_options=reader_options)
51 # The kerchunk .parquet storage format isn't actually a parquet, but a directory that contains named parquets for each group/variable.
NotImplementedError:

EDIT: I thought it worked now, but I used the actual dataset instead of the virtual one... |
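One way around that limitation, sketched under the assumption that the kerchunk xarray engine used earlier in the thread is available, is to read the real coordinate values through that engine and attach them to the virtual datasets. As the next comment shows, this only supplies index values; the combine can still fail once reindexing of the ManifestArrays is required.

import xarray as xr

# The kerchunk reader rejects loadable_variables, so load just the coordinate
# values from the reference file and attach them to the virtual dataset.
real1 = xr.open_dataset("ref1.json", engine="kerchunk")
vds1 = vds1.assign_coords(x=real1["x"].values, y=real1["y"].values)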
I tried to set the indices explicitly now with:

>>> vds1['x'] = ds1.x
>>> vds1['y']= ds1.y
>>> vds2['x'] = ds2.x
>>> vds2['y'] = ds2.y
>>> vds1
<xarray.Dataset> Size: 96B
Dimensions: (x: 2, y: 2)
Coordinates:
* x (x) int64 16B 10 20
* y (y) int64 16B 1 2
Data variables:
a (x, y) int64 32B ManifestArray<shape=(2, 2), dtype=int64, chunks...
b (x, y) int64 32B ManifestArray<shape=(2, 2), dtype=int64, chunks...
xr.combine_by_coords([vds1, vds2], join='outer')

This results in:

Traceback:
---------------------------------------------------------------------------
NotImplementedError Traceback (most recent call last)
Cell In[55], line 1
----> 1 xr.combine_by_coords([vds1,vds2], join='outer')
File ~/Documents/GitProjects/venv/virtuali/lib/python3.13/site-packages/xarray/core/combine.py:986, in combine_by_coords(data_objects, compat, data_vars, coords, fill_value, join, combine_attrs)
971 # Perform the multidimensional combine on each group of data variables
972 # before merging back together
973 concatenated_grouped_by_data_vars = tuple(
974 _combine_single_variable_hypercube(
975 tuple(datasets_with_same_vars),
(...)
983 for vars, datasets_with_same_vars in grouped_by_vars
984 )
--> 986 return merge(
987 concatenated_grouped_by_data_vars,
988 compat=compat,
989 fill_value=fill_value,
990 join=join,
991 combine_attrs=combine_attrs,
992 )
File ~/Documents/GitProjects/venv/virtuali/lib/python3.13/site-packages/xarray/core/merge.py:976, in merge(objects, compat, join, fill_value, combine_attrs)
973 obj = obj.to_dataset()
974 dict_like_objects.append(obj)
--> 976 merge_result = merge_core(
977 dict_like_objects,
978 compat,
979 join,
980 combine_attrs=combine_attrs,
981 fill_value=fill_value,
982 )
983 return Dataset._construct_direct(**merge_result._asdict())
File ~/Documents/GitProjects/venv/virtuali/lib/python3.13/site-packages/xarray/core/merge.py:692, in merge_core(objects, compat, join, combine_attrs, priority_arg, explicit_coords, indexes, fill_value, skip_align_args)
689 skip_align_objs = [(pos, objects.pop(pos)) for pos in skip_align_args]
691 coerced = coerce_pandas_values(objects)
--> 692 aligned = deep_align(
693 coerced, join=join, copy=False, indexes=indexes, fill_value=fill_value
694 )
696 for pos, obj in skip_align_objs:
697 aligned.insert(pos, obj)
File ~/Documents/GitProjects/venv/virtuali/lib/python3.13/site-packages/xarray/core/alignment.py:947, in deep_align(objects, join, copy, indexes, exclude, raise_on_invalid, fill_value)
944 else:
945 out.append(variables)
--> 947 aligned = align(
948 *targets,
949 join=join,
950 copy=copy,
951 indexes=indexes,
952 exclude=exclude,
953 fill_value=fill_value,
954 )
956 for position, key, aligned_obj in zip(positions, keys, aligned, strict=True):
957 if key is no_key:
File ~/Documents/GitProjects/venv/virtuali/lib/python3.13/site-packages/xarray/core/alignment.py:883, in align(join, copy, indexes, exclude, fill_value, *objects)
687 """
688 Given any number of Dataset and/or DataArray objects, returns new
689 objects with aligned indexes and dimension sizes.
(...)
873
874 """
875 aligner = Aligner(
876 objects,
877 join=join,
(...)
881 fill_value=fill_value,
882 )
--> 883 aligner.align()
884 return aligner.results
File ~/Documents/GitProjects/venv/virtuali/lib/python3.13/site-packages/xarray/core/alignment.py:583, in Aligner.align(self)
581 self.results = self.objects
582 else:
--> 583 self.reindex_all()
File ~/Documents/GitProjects/venv/virtuali/lib/python3.13/site-packages/xarray/core/alignment.py:558, in Aligner.reindex_all(self)
557 def reindex_all(self) -> None:
--> 558 self.results = tuple(
559 self._reindex_one(obj, matching_indexes)
560 for obj, matching_indexes in zip(
561 self.objects, self.objects_matching_indexes, strict=True
562 )
563 )
File ~/Documents/GitProjects/venv/virtuali/lib/python3.13/site-packages/xarray/core/alignment.py:559, in <genexpr>(.0)
557 def reindex_all(self) -> None:
558 self.results = tuple(
--> 559 self._reindex_one(obj, matching_indexes)
560 for obj, matching_indexes in zip(
561 self.objects, self.objects_matching_indexes, strict=True
562 )
563 )
File ~/Documents/GitProjects/venv/virtuali/lib/python3.13/site-packages/xarray/core/alignment.py:547, in Aligner._reindex_one(self, obj, matching_indexes)
544 new_indexes, new_variables = self._get_indexes_and_vars(obj, matching_indexes)
545 dim_pos_indexers = self._get_dim_pos_indexers(matching_indexes)
--> 547 return obj._reindex_callback(
548 self,
549 dim_pos_indexers,
550 new_variables,
551 new_indexes,
552 self.fill_value,
553 self.exclude_dims,
554 self.exclude_vars,
555 )
File ~/Documents/GitProjects/venv/virtuali/lib/python3.13/site-packages/xarray/core/dataset.py:3634, in Dataset._reindex_callback(self, aligner, dim_pos_indexers, variables, indexes, fill_value, exclude_dims, exclude_vars)
3628 else:
3629 to_reindex = {
3630 k: v
3631 for k, v in self.variables.items()
3632 if k not in variables and k not in exclude_vars
3633 }
-> 3634 reindexed_vars = alignment.reindex_variables(
3635 to_reindex,
3636 dim_pos_indexers,
3637 copy=aligner.copy,
3638 fill_value=fill_value,
3639 sparse=aligner.sparse,
3640 )
3641 new_variables.update(reindexed_vars)
3642 new_coord_names = self._coord_names | set(new_indexes)
File ~/Documents/GitProjects/venv/virtuali/lib/python3.13/site-packages/xarray/core/alignment.py:79, in reindex_variables(variables, dim_pos_indexers, copy, fill_value, sparse)
76 needs_masking = any(d in masked_dims for d in var.dims)
78 if needs_masking:
---> 79 new_var = var._getitem_with_mask(indxr, fill_value=fill_value_)
80 elif all(is_full_slice(k) for k in indxr):
81 # no reindexing necessary
82 # here we need to manually deal with copying data, since
83 # we neither created a new ndarray nor used fancy indexing
84 new_var = var.copy(deep=copy)
File ~/Documents/GitProjects/venv/virtuali/lib/python3.13/site-packages/xarray/core/variable.py:863, in Variable._getitem_with_mask(self, key, fill_value)
860 actual_indexer = indexer
862 indexable = as_indexable(self._data)
--> 863 data = indexing.apply_indexer(indexable, actual_indexer)
865 mask = indexing.create_mask(indexer, self.shape, data)
866 # we need to invert the mask in order to pass data first. This helps
867 # pint to choose the correct unit
868 # TODO: revert after https://github.com/hgrecco/pint/issues/1019 is fixed
File ~/Documents/GitProjects/venv/virtuali/lib/python3.13/site-packages/xarray/core/indexing.py:1031, in apply_indexer(indexable, indexer)
1029 return indexable.vindex[indexer]
1030 elif isinstance(indexer, OuterIndexer):
-> 1031 return indexable.oindex[indexer]
1032 else:
1033 return indexable[indexer]
File ~/Documents/GitProjects/venv/virtuali/lib/python3.13/site-packages/xarray/core/indexing.py:369, in IndexCallable.__getitem__(self, key)
368 def __getitem__(self, key: Any) -> Any:
--> 369 return self.getter(key)
File ~/Documents/GitProjects/venv/virtuali/lib/python3.13/site-packages/xarray/core/indexing.py:1508, in NumpyIndexingAdapter._oindex_get(self, indexer)
1506 def _oindex_get(self, indexer: OuterIndexer):
1507 key = _outer_to_numpy_indexer(indexer, self.array.shape)
-> 1508 return self.array[key]
File ~/Documents/GitProjects/venv/virtuali/lib/python3.13/site-packages/virtualizarr/manifests/array.py:226, in ManifestArray.__getitem__(self, key)
224 return self
225 else:
--> 226 raise NotImplementedError(f"Doesn't support slicing with {indexer}")
NotImplementedError: Doesn't support slicing with (array([-1, -1, 0, 1]), slice(None, None, None)) |
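The indexer in that last error is just the outer-join reindexer. A small sketch of where the [-1, -1, 0, 1] comes from and why a ManifestArray cannot satisfy it:

import numpy as np

# join='outer' builds the union of the x values and reindexes each dataset
# onto it; positions absent from a dataset get the placeholder -1 plus a
# fill mask. vds1 only has x = [10, 20], so against the union [1, 2, 10, 20]
# its indexer is [-1, -1, 0, 1], exactly what the traceback shows above.
# A ManifestArray only stores chunk references and cannot materialise fill
# values, hence the NotImplementedError.
union_x = np.array([1, 2, 10, 20])
vds1_x = np.array([10, 20])
indexer = np.array([np.where(vds1_x == v)[0][0] if v in vds1_x else -1 for v in union_x])
print(indexer)  # [-1 -1  0  1]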
What I did
I created kerchunk reference files following #381 and tried to combine them via xr.merge.
What happened
What I expected
I expected the virtual equivalent to the "real" datasets to return:
Environment
vz.version = 1.2.1.dev19+g0d2d6ab