Skip to content

Commit

Permalink
Sc 50 disk backed operations (#93)
Browse files Browse the repository at this point in the history
* SC_50 disk backed operations

* SC_50 update comment

* SC50 disk backed operations, reload
  • Loading branch information
ArneDefauw authored Feb 11, 2025
1 parent 9afe814 commit 72c603a
Show file tree
Hide file tree
Showing 7 changed files with 51 additions and 53 deletions.
23 changes: 14 additions & 9 deletions src/harpy/image/_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@

from harpy.utils._io import _incremental_io_on_disk
from harpy.utils.pylogger import get_pylogger
from harpy.utils.utils import _self_contained_warning_message

log = get_pylogger(__name__)

Expand Down Expand Up @@ -141,7 +140,9 @@ def add_to_sdata(
if output_layer in [*sdata.images]:
if sdata.is_backed():
if overwrite:
sdata = _incremental_io_on_disk(sdata, output_layer=output_layer, element=spatial_element)
sdata = _incremental_io_on_disk(
sdata, output_layer=output_layer, element=spatial_element, element_type="images"
)
else:
raise ValueError(
f"Attempting to overwrite 'sdata.images[\"{output_layer}\"]', but overwrite is set to False. Set overwrite to True to overwrite the .zarr store."
Expand All @@ -153,9 +154,10 @@ def add_to_sdata(
sdata[output_layer] = spatial_element
if sdata.is_backed():
sdata.write_element(output_layer)
if warning_message := _self_contained_warning_message(sdata, output_layer):
log.warning(warning_message)
sdata = read_zarr(sdata.path)
sdata_temp = read_zarr(sdata.path, selection=["images"])
del sdata[output_layer]
sdata[output_layer] = sdata_temp[output_layer]
del sdata_temp

return sdata

Expand Down Expand Up @@ -209,7 +211,9 @@ def add_to_sdata(
if output_layer in [*sdata.labels]:
if sdata.is_backed():
if overwrite:
sdata = _incremental_io_on_disk(sdata, output_layer=output_layer, element=spatial_element)
sdata = _incremental_io_on_disk(
sdata, output_layer=output_layer, element=spatial_element, element_type="labels"
)
else:
raise ValueError(
f"Attempting to overwrite 'sdata.labels[\"{output_layer}\"]', but overwrite is set to False. Set overwrite to True to overwrite the .zarr store."
Expand All @@ -220,9 +224,10 @@ def add_to_sdata(
sdata[output_layer] = spatial_element
if sdata.is_backed():
sdata.write_element(output_layer)
if warning_message := _self_contained_warning_message(sdata, output_layer):
log.warning(warning_message)
sdata = read_zarr(sdata.path)
sdata_temp = read_zarr(sdata.path, selection=["labels"])
del sdata[output_layer]
sdata[output_layer] = sdata_temp[output_layer]
del sdata_temp

return sdata

Expand Down
9 changes: 5 additions & 4 deletions src/harpy/image/pixel_clustering/_clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from harpy.image._image import _get_spatial_element, add_labels_layer
from harpy.utils._keys import _INSTANCE_KEY, _REGION_KEY, _SPATIAL, ClusteringKey
from harpy.utils.pylogger import get_pylogger
from harpy.utils.utils import _get_uint_dtype, _self_contained_warning_message
from harpy.utils.utils import _get_uint_dtype

log = get_pylogger(__name__)

Expand Down Expand Up @@ -223,9 +223,10 @@ def _fix_name(layer: str | Iterable[str]):
_labels_flowsom_name = f"labels_flowsom_{uuid.uuid4()}"
sdata.images[_labels_flowsom_name] = se_intermediate
sdata.write_element(_labels_flowsom_name)
if warning_message := _self_contained_warning_message(sdata, _labels_flowsom_name):
log.warning(warning_message)
sdata = read_zarr(sdata.path)
sdata_temp = read_zarr(sdata.path, selection=["images"])
del sdata[_labels_flowsom_name]
sdata[_labels_flowsom_name] = sdata_temp[_labels_flowsom_name]
del sdata_temp
_labels_flowsom = _get_spatial_element(sdata, layer=_labels_flowsom_name).data
else:
_labels_flowsom = _labels_flowsom.persist()
Expand Down
22 changes: 5 additions & 17 deletions src/harpy/points/_points.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from harpy.utils._io import _incremental_io_on_disk
from harpy.utils.pylogger import get_pylogger
from harpy.utils.utils import _self_contained_warning_message

log = get_pylogger(__name__)

Expand Down Expand Up @@ -53,7 +52,7 @@ def add_points_layer(
if output_layer in [*sdata.points]:
if sdata.is_backed():
if overwrite:
sdata = _incremental_io_on_disk(sdata, output_layer=output_layer, element=points)
sdata = _incremental_io_on_disk(sdata, output_layer=output_layer, element=points, element_type="points")
else:
raise ValueError(
f"Attempting to overwrite 'sdata.points[\"{output_layer}\"]', but overwrite is set to False. Set overwrite to True to overwrite the .zarr store."
Expand All @@ -65,19 +64,8 @@ def add_points_layer(
sdata[output_layer] = points
if sdata.is_backed():
sdata.write_element(output_layer)
if warning_message := _self_contained_warning_message(sdata, output_layer):
log.warning(warning_message)
if len(sdata[output_layer].dask) != 1:
log.warning(
f"Element '{output_layer}' is Dask-backed, but not all tasks have been executed in the in-memory SpatialData object.\n"
"This may lead to unexpected behavior if lazy computations are not triggered.\n"
"To resolve this, ensure that you assign the result of operations:\n"
" sdata = harpy.{operation}(sdata, ...)\n"
"Alternatively, manually reload from the Zarr store:\n"
f" sdata = spatialdata.read_zarr('{sdata.path}')\n"
"For more details, see the discussion at:\n"
" https://github.com/saeyslab/harpy/issues/90"
)
sdata = read_zarr(sdata.path)

sdata_temp = read_zarr(sdata.path, selection=["points"])
del sdata[output_layer]
sdata[output_layer] = sdata_temp[output_layer]
del sdata_temp
return sdata
13 changes: 7 additions & 6 deletions src/harpy/shape/_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
from harpy.utils._io import _incremental_io_on_disk
from harpy.utils._keys import _INSTANCE_KEY, _REGION_KEY
from harpy.utils.pylogger import get_pylogger
from harpy.utils.utils import _self_contained_warning_message

log = get_pylogger(__name__)

Expand Down Expand Up @@ -195,7 +194,9 @@ def add_to_sdata(
if output_layer in [*sdata.shapes]:
if sdata.is_backed():
if overwrite:
sdata = _incremental_io_on_disk(sdata, output_layer=output_layer, element=spatial_element)
sdata = _incremental_io_on_disk(
sdata, output_layer=output_layer, element=spatial_element, element_type="shapes"
)
else:
raise ValueError(
f"Attempting to overwrite 'sdata.shapes[\"{output_layer}\"]', but overwrite is set to False. Set overwrite to True to overwrite the .zarr store."
Expand All @@ -206,10 +207,10 @@ def add_to_sdata(
sdata[output_layer] = spatial_element
if sdata.is_backed():
sdata.write_element(output_layer)
# Note: currently shapes are in memory, and the latter warning will never be triggered.
if warning_message := _self_contained_warning_message(sdata, output_layer):
log.warning(warning_message)
sdata = read_zarr(sdata.path)
sdata_temp = read_zarr(sdata.path, selection=["shapes"])
del sdata[output_layer]
sdata[output_layer] = sdata_temp[output_layer]
del sdata_temp

return sdata

Expand Down
14 changes: 7 additions & 7 deletions src/harpy/table/_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from harpy.utils._io import _incremental_io_on_disk
from harpy.utils._keys import _INSTANCE_KEY, _REGION_KEY
from harpy.utils.pylogger import get_pylogger
from harpy.utils.utils import _self_contained_warning_message

log = get_pylogger(__name__)

Expand Down Expand Up @@ -46,11 +45,12 @@ def add_table(
adata,
)

# given an adata.
if output_layer in [*sdata.tables]:
if sdata.is_backed():
if overwrite:
sdata = _incremental_io_on_disk(sdata, output_layer=output_layer, element=adata)
sdata = _incremental_io_on_disk(
sdata, output_layer=output_layer, element=adata, element_type="tables"
)
else:
raise ValueError(
f"Attempting to overwrite 'sdata.tables[\"{output_layer}\"]', but overwrite is set to False. Set overwrite to True to overwrite the .zarr store."
Expand All @@ -61,9 +61,9 @@ def add_table(
sdata[output_layer] = adata
if sdata.is_backed():
sdata.write_element(output_layer)
# Note: currently tables are in memory, and the latter warning will never be triggered.
if warning_message := _self_contained_warning_message(sdata, output_layer):
log.warning(warning_message)
sdata = read_zarr(sdata.path)
sdata_temp = read_zarr(sdata.path, selection=["tables"])
del sdata[output_layer]
sdata[output_layer] = sdata_temp[output_layer]
del sdata_temp

return sdata
20 changes: 13 additions & 7 deletions src/harpy/utils/_io.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import uuid
from typing import Literal

from anndata import AnnData
from dask.dataframe import DataFrame
Expand All @@ -9,7 +10,6 @@
from xarray import DataArray, DataTree

from harpy.utils.pylogger import get_pylogger
from harpy.utils.utils import _self_contained_warning_message

log = get_pylogger(__name__)

Expand All @@ -18,7 +18,11 @@ def _incremental_io_on_disk(
sdata: SpatialData,
output_layer: str,
element: DataArray | DataTree | DataFrame | GeoDataFrame | AnnData,
element_type: str = Literal["images", "labels", "shapes", "tables", "points"],
) -> SpatialData:
assert element_type in ["images", "labels", "shapes", "tables", "points"], (
"'element_type' should be one of [ 'images', 'labels', 'shapes', 'tables', 'points' ]"
)
new_output_layer = f"{output_layer}_{uuid.uuid4()}"
# a. write a backup copy of the data
sdata[new_output_layer] = element
Expand All @@ -33,19 +37,21 @@ def _incremental_io_on_disk(
del sdata[new_output_layer]
del sdata[output_layer]
# a3 load the backup copy into memory
sdata_copy = read_zarr(sdata.path)
sdata_copy = read_zarr(sdata.path, selection=[element_type])
# b1. rewrite the original data
sdata.delete_element_from_disk(output_layer)
sdata[output_layer] = sdata_copy[new_output_layer]
log.warning(f"layer with name '{output_layer}' already exists. Overwriting...")
sdata.write_element(output_layer)
# b2. reload the new data into memory (because it has been written but in-memory it still points
# from the backup location)
if warning_message := _self_contained_warning_message(sdata, output_layer):
log.warning(warning_message)
sdata = read_zarr(sdata.path)
sdata_materialized = read_zarr(sdata.path, selection=[element_type])
# to make sdata point to layer that is materialized, and keep object id.
del sdata[output_layer]
sdata[output_layer] = sdata_materialized[output_layer]
# c. remove the backup copy
del sdata[new_output_layer]
sdata.delete_element_from_disk(new_output_layer)
del sdata_materialized[new_output_layer]
sdata_materialized.delete_element_from_disk(new_output_layer)
del sdata_materialized

return sdata
3 changes: 0 additions & 3 deletions src/harpy/utils/_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from harpy.table._table import add_table_layer
from harpy.utils._keys import _INSTANCE_KEY, _REGION_KEY
from harpy.utils.pylogger import get_pylogger
from harpy.utils.utils import _self_contained_warning_message

log = get_pylogger(__name__)

Expand Down Expand Up @@ -173,8 +172,6 @@ def _fix_name(name: str | Iterable[str]):
sdata_queried[_layer_to_copy] = sdata[_layer_to_copy]
if sdata_queried.is_backed():
sdata_queried.write_element(_layer_to_copy)
if warning_message := _self_contained_warning_message(sdata, _layer_to_copy):
log.warning(warning_message)

# if backed, and if there were layers copied, we read back from zarr, otherwise sdata_queried not self contained
if sdata_queried.is_backed() and layers_to_copy:
Expand Down

0 comments on commit 72c603a

Please sign in to comment.