From ae0711da53684d8dc558071ed45f9c1099b36f2d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Paul=20M=C3=BCller?=
Date: Fri, 5 Jan 2024 14:18:47 +0100
Subject: [PATCH] enh: favor array operations in `yield_filtered_array_stacks`
 (~4x speed-up)

---
 CHANGELOG                    |  3 +-
 dclab/rtdc_dataset/export.py | 60 ++++++++++++++++++++++---------
 tests/test_export.py         | 69 +++++++++++++++++++++++++++++++++++-
 tests/test_rtdc_fmt_hdf5.py  |  2 +-
 4 files changed, 114 insertions(+), 20 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 43bbca54..df0b5493 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,4 +1,4 @@
-0.56.4
+0.57.0
  - fix: integer overflow in downsample_grid
  - fix: removed unnecessary computation of hierarchy filter instance
  - enh: cythonize downsample_grid (~10x speed-up)
@@ -9,6 +9,7 @@
  - enh: globally use chunk sizes of ~1MiB when writing HDF5 data (minor
   speedup since previously a chunk size of 100 events was used for images
   and scalar features were written in one big chunk)
+ - enh: favor array operations in `yield_filtered_array_stacks` (~4x speed-up)
 - ref: implement `__array__` methods for hierarchy event classes
 - ref: implement `__array__` method for `H5MaskEvent`
 - ref: new submodule for hierarchy format
diff --git a/dclab/rtdc_dataset/export.py b/dclab/rtdc_dataset/export.py
index 1decdde5..ed25070e 100644
--- a/dclab/rtdc_dataset/export.py
+++ b/dclab/rtdc_dataset/export.py
@@ -397,10 +397,12 @@ def yield_filtered_array_stacks(data, indices):
     Parameters
     ----------
     data: np.ndarray or h5py.Dataset
-        The full, unfiltered input feature data.
+        The full, unfiltered input feature data. Must implement
+        the `shape` and `dtype` properties. If it implements the
+        `__array__` method, fast slicing is used.
     indices: np.ndarray or list
-        The indices in data (first axis) that should be written
-        to the chunks returned by this generator.
+        The indices (integer values) for `data` (first axis), indicating
+        which elements should be returned by this generator.
 
     Notes
     -----
@@ -410,24 +412,48 @@
     and that the events in `data` all have the same shape.
     The dtype of the returned chunks is determined by the first
     item in `data`.
+
+    This method works with sliceable (e.g. np.ndarray) and
+    non-sliceable (e.g. tdms-format-based images) input data. If the
+    input data is sliceable (which is determined by the availability
+    of the `__array__` method), then fast numpy slicing is used. If the
+    input data does not support slicing (`__array__` not defined), then
+    a slow iteration over `indices` is done.
+
+    In the slow iteration case, the returned array data are overwritten
+    in place. If you need to retain a copy of the `yield`ed chunks,
+    apply `np.array(..., copy=True)` to the returned chunks.
     """
     chunk_shape = RTDCWriter.get_best_nd_chunks(item_shape=data.shape[1:],
                                                 item_dtype=data.dtype)
     chunk_size = chunk_shape[0]
-    # assemble filtered image stacks
-    chunk = np.zeros(chunk_shape, dtype=data.dtype)
-
-    jj = 0
-    for ii in indices:
-        chunk[jj] = data[ii]
-        if (jj + 1) % chunk_size == 0:
-            jj = 0
-            yield chunk
-        else:
-            jj += 1
-    # yield remainder
-    if jj:
-        yield chunk[:jj]
+
+    if hasattr(data, "__array__"):
+        # We have an array-like object and can do slicing with the indexing
+        # array. This speeds up chunk creation, e.g. for the HDF5 file
+        # format, where all data are present in an array-like fashion.
+        indices = np.array(indices)
+        stop = 0
+        for kk in range(len(indices) // chunk_size):
+            start = chunk_size * kk
+            stop = chunk_size * (kk + 1)
+            yield data[indices[start:stop]]
+        if stop < len(indices):
+            yield data[indices[stop:]]
+    else:
+        # assemble filtered image stacks
+        chunk = np.zeros(chunk_shape, dtype=data.dtype)
+        jj = 0
+        for ii in indices:
+            chunk[jj] = data[ii]
+            if (jj + 1) % chunk_size == 0:
+                jj = 0
+                yield chunk
+            else:
+                jj += 1
+        # yield remainder
+        if jj:
+            yield chunk[:jj]
 
 
 def store_filtered_feature(rtdc_writer, feat, data, filtarr):
diff --git a/tests/test_export.py b/tests/test_export.py
index 341cdc12..4d9f81a6 100644
--- a/tests/test_export.py
+++ b/tests/test_export.py
@@ -1,3 +1,4 @@
+import collections
 import io
 import os
 from os.path import join
@@ -10,7 +11,8 @@
 
 import dclab
 from dclab import dfn, new_dataset, RTDCWriter
-from dclab.rtdc_dataset.export import store_filtered_feature
+from dclab.rtdc_dataset.export import (
+    store_filtered_feature, yield_filtered_array_stacks)
 
 from helper_methods import example_data_dict, retrieve_data
 
@@ -661,3 +663,68 @@ def test_tsv_not_filtered():
     edest = tempfile.mkdtemp()
     f1 = join(edest, "test.tsv")
     ds.export.tsv(f1, keys, filtered=False)
+
+
+def test_yield_filtered_array_stacks_array():
+    enum = np.arange(547)
+    ebol = np.ones(547, dtype=bool)
+    ebol[10] = False
+    ebol[412] = False
+
+    data = np.random.random((547, 80, 320))
+    indices = enum[ebol]
+    stacked = []
+    for chunk in yield_filtered_array_stacks(data, indices):
+        stacked.append(chunk)
+    assert len(stacked) == 55
+    assert len(stacked[0]) == len(stacked[1])
+    assert len(stacked[-1]) == 547 - 2 - 54 * len(stacked[0])
+
+    data2 = np.concatenate(stacked, axis=0)
+    assert len(data2) == 547 - 2
+    assert data2[0].shape == data[0].shape
+    assert np.all(data[indices] == data2)
+
+
+def test_yield_filtered_array_stacks_list():
+    # custom list we will be using, which implements `shape` and `dtype`,
+    # but not `__array__`
+    class ListFeat(collections.UserList):
+        def __init__(self, data):
+            super(ListFeat, self).__init__(data)
+
+        @property
+        def shape(self):
+            return tuple([len(self.data)] + list(self.data[0].shape))
+
+        @property
+        def dtype(self):
+            return self.data[0].dtype
+
+    enum = np.arange(547, dtype=int)
+    ebol = np.ones(547, dtype=bool)
+    ebol[10] = False
+    ebol[412] = False
+
+    # instead of the plain ndarray used above, create a list of arrays here
+    datalist = []
+    for _ in range(547):
+        datalist.append(np.random.random((80, 320)))
+    data = ListFeat(datalist)
+    assert data.shape == (547, 80, 320)
+    assert data.dtype == datalist[0].dtype
+    assert np.all(data[0] == datalist[0])
+
+    indices = enum[ebol]
+    stacked = []
+    for chunk in yield_filtered_array_stacks(data, indices):
+        stacked.append(np.array(chunk, copy=True))
+    assert len(stacked) == 55
+    assert len(stacked[0]) == len(stacked[1])
+    assert len(stacked[-1]) == 547 - 2 - 54 * len(stacked[0])
+    assert np.all(stacked[0][0] == datalist[0])
+
+    data2 = np.concatenate(stacked, axis=0)
+    assert len(data2) == 547 - 2
+    assert data2[0].shape == data[0].shape
+    assert np.all(np.array(datalist)[indices] == data2)
diff --git a/tests/test_rtdc_fmt_hdf5.py b/tests/test_rtdc_fmt_hdf5.py
index 5f0e7d58..c34a2b51 100644
--- a/tests/test_rtdc_fmt_hdf5.py
+++ b/tests/test_rtdc_fmt_hdf5.py
@@ -306,7 +306,7 @@ def test_defective_feature_volume():
 def test_discouraged_array_dunder_childndarray():
     ds = new_dataset(retrieve_data("fmt-hdf5_fl_wide-channel_2023.zip"))
     with pytest.warns(UserWarning, match="It may consume a lot of memory"):
-        maskbool = ds["mask"].__array__()
+        ds["mask"].__array__()
 
 
 @pytest.mark.filterwarnings(
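
For reference, a minimal usage sketch of the generator changed above, mirroring
the new tests (the array shape and the filtered-out indices are arbitrary
example values, not taken from the patch). Copying each chunk is only strictly
required for the slow, non-sliceable code path, where yielded chunks are
overwritten in place, but it is safe for both paths:

    import numpy as np
    from dclab.rtdc_dataset.export import yield_filtered_array_stacks

    data = np.random.random((547, 80, 320))  # unfiltered feature data
    # events that pass the filter (all except events 10 and 412)
    indices = np.setdiff1d(np.arange(547), [10, 412])

    # copy each yielded chunk before retaining it; the fast (array) path
    # yields fresh arrays, but copying keeps the loop path-agnostic
    stacked = [np.array(chunk, copy=True)
               for chunk in yield_filtered_array_stacks(data, indices)]
    filtered = np.concatenate(stacked, axis=0)
    assert np.all(filtered == data[indices])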