From ae0711da53684d8dc558071ed45f9c1099b36f2d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Paul=20M=C3=BCller?=
Date: Fri, 5 Jan 2024 14:18:47 +0100
Subject: [PATCH] enh: favor array operations in `yield_filtered_array_stacks`
 (~4x speed-up)

---
 CHANGELOG                    |  3 +-
 dclab/rtdc_dataset/export.py | 60 ++++++++++++++++++++++---------
 tests/test_export.py         | 69 +++++++++++++++++++++++++++++++++++-
 tests/test_rtdc_fmt_hdf5.py  |  2 +-
 4 files changed, 114 insertions(+), 20 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 43bbca54..df0b5493 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,4 +1,4 @@
-0.56.4
+0.57.0
  - fix: integer overflow in downsample_grid
  - fix: removed unnecessary computation of hierarchy filter instance
  - enh: cythonize downsample_grid (~10x speed-up)
@@ -9,6 +9,7 @@
  - enh: globally use chunk sizes of ~1MiB when writing HDF5 data (minor
   speedup since previously a chunk size of 100 events was used for images
   and scalar features were written in one big chunk)
+ - enh: favor array operations in `yield_filtered_array_stacks` (~4x speed-up)
 - ref: implement `__array__` methods for hierarchy event classes
 - ref: implement `__array__` method for `H5MaskEvent`
 - ref: new submodule for hierarchy format
diff --git a/dclab/rtdc_dataset/export.py b/dclab/rtdc_dataset/export.py
index 1decdde5..ed25070e 100644
--- a/dclab/rtdc_dataset/export.py
+++ b/dclab/rtdc_dataset/export.py
@@ -397,10 +397,12 @@ def yield_filtered_array_stacks(data, indices):
     Parameters
     ----------
     data: np.ndarray or h5py.Dataset
-        The full, unfiltered input feature data.
+        The full, unfiltered input feature data. Must implement
+        the `shape` and `dtype` properties. If it implements the
+        `__array__` method, fast slicing is used.
     indices: np.ndarray or list
-        The indices in data (first axis) that should be written
-        to the chunks returned by this generator.
+        The indices (integer values) for `data` (first axis), indicating
+        which elements should be returned by this generator.
 
     Notes
     -----
@@ -410,24 +412,48 @@
     and that the events in `data` all have the same shape.
     The dtype of the returned chunks is determined by the first
     item in `data`.
+
+    This method works with sliceable (e.g. np.ndarray) and
+    non-sliceable (e.g. tdms-format-based images) input data. If the
+    input data is sliceable (which is determined by the availability
+    of the `__array__` method), then fast numpy slicing is used. If the
+    input data does not support slicing (`__array__` not defined), then
+    a slow iteration over `indices` is done.
+
+    In the slow iteration case, the returned array data are overwritten
+    in place. If you need to retain a copy of the `yield`ed chunks,
+    apply `np.array(..., copy=True)` to the returned chunks.
     """
     chunk_shape = RTDCWriter.get_best_nd_chunks(item_shape=data.shape[1:],
                                                 item_dtype=data.dtype)
     chunk_size = chunk_shape[0]
-    # assemble filtered image stacks
-    chunk = np.zeros(chunk_shape, dtype=data.dtype)
-
-    jj = 0
-    for ii in indices:
-        chunk[jj] = data[ii]
-        if (jj + 1) % chunk_size == 0:
-            jj = 0
-            yield chunk
-        else:
-            jj += 1
-    # yield remainder
-    if jj:
-        yield chunk[:jj]
+
+    if hasattr(data, "__array__"):
+        # We have an array-like object and can do slicing with the indexing
+        # array. This speeds up chunk creation, e.g. for the HDF5 file
+        # format, where all data are present in an array-like fashion.
+        indices = np.array(indices)
+        stop = 0
+        for kk in range(len(indices) // chunk_size):
+            start = chunk_size * kk
+            stop = chunk_size * (kk + 1)
+            yield data[indices[start:stop]]
+        if stop < len(indices):
+            yield data[indices[stop:]]
+    else:
+        # assemble filtered image stacks
+        chunk = np.zeros(chunk_shape, dtype=data.dtype)
+        jj = 0
+        for ii in indices:
+            chunk[jj] = data[ii]
+            if (jj + 1) % chunk_size == 0:
+                jj = 0
+                yield chunk
+            else:
+                jj += 1
+        # yield remainder
+        if jj:
+            yield chunk[:jj]
 
 
 def store_filtered_feature(rtdc_writer, feat, data, filtarr):
diff --git a/tests/test_export.py b/tests/test_export.py
index 341cdc12..4d9f81a6 100644
--- a/tests/test_export.py
+++ b/tests/test_export.py
@@ -1,3 +1,4 @@
+import collections
 import io
 import os
 from os.path import join
@@ -10,7 +11,8 @@
 
 import dclab
 from dclab import dfn, new_dataset, RTDCWriter
-from dclab.rtdc_dataset.export import store_filtered_feature
+from dclab.rtdc_dataset.export import (
+    store_filtered_feature, yield_filtered_array_stacks)
 
 from helper_methods import example_data_dict, retrieve_data
 
@@ -661,3 +663,68 @@ def test_tsv_not_filtered():
     edest = tempfile.mkdtemp()
     f1 = join(edest, "test.tsv")
     ds.export.tsv(f1, keys, filtered=False)
+
+
+def test_yield_filtered_array_stacks_array():
+    enum = np.arange(547)
+    ebol = np.ones(547, dtype=bool)
+    ebol[10] = False
+    ebol[412] = False
+
+    data = np.random.random((547, 80, 320))
+    indices = enum[ebol]
+    stacked = []
+    for chunk in yield_filtered_array_stacks(data, indices):
+        stacked.append(chunk)
+    assert len(stacked) == 55
+    assert len(stacked[0]) == len(stacked[1])
+    assert len(stacked[-1]) == 547 - 2 - 54 * len(stacked[0])
+
+    data2 = np.concatenate(stacked, axis=0)
+    assert len(data2) == 547 - 2
+    assert data2[0].shape == data[0].shape
+    assert np.all(data[indices] == data2)
+
+
+def test_yield_filtered_array_stacks_list():
+    # custom list we will be using, which implements `shape` and `dtype`,
+    # but not `__array__`
+    class ListFeat(collections.UserList):
+        def __init__(self, data):
+            super(ListFeat, self).__init__(data)
+
+        @property
+        def shape(self):
+            return tuple([len(self.data)] + list(self.data[0].shape))
+
+        @property
+        def dtype(self):
+            return self.data[0].dtype
+
+    enum = np.arange(547, dtype=int)
+    ebol = np.ones(547, dtype=bool)
+    ebol[10] = False
+    ebol[412] = False
+
+    # instead of the plain ndarray used above, create a list of arrays here
+    datalist = []
+    for _ in range(547):
+        datalist.append(np.random.random((80, 320)))
+    data = ListFeat(datalist)
+    assert data.shape == (547, 80, 320)
+    assert data.dtype == datalist[0].dtype
+    assert np.all(data[0] == datalist[0])
+
+    indices = enum[ebol]
+    stacked = []
+    for chunk in yield_filtered_array_stacks(data, indices):
+        stacked.append(np.array(chunk, copy=True))
+    assert len(stacked) == 55
+    assert len(stacked[0]) == len(stacked[1])
+    assert len(stacked[-1]) == 547 - 2 - 54 * len(stacked[0])
+    assert np.all(stacked[0][0] == datalist[0])
+
+    data2 = np.concatenate(stacked, axis=0)
+    assert len(data2) == 547 - 2
+    assert data2[0].shape == data[0].shape
+    assert np.all(np.array(datalist)[indices] == data2)
diff --git a/tests/test_rtdc_fmt_hdf5.py b/tests/test_rtdc_fmt_hdf5.py
index 5f0e7d58..c34a2b51 100644
--- a/tests/test_rtdc_fmt_hdf5.py
+++ b/tests/test_rtdc_fmt_hdf5.py
@@ -306,7 +306,7 @@ def test_defective_feature_volume():
 def test_discouraged_array_dunder_childndarray():
     ds = new_dataset(retrieve_data("fmt-hdf5_fl_wide-channel_2023.zip"))
     with pytest.warns(UserWarning, match="It may consume a lot of memory"):
-        maskbool = ds["mask"].__array__()
+        ds["mask"].__array__()
 
 
 @pytest.mark.filterwarnings(
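
For reference, a minimal usage sketch of the generator changed above, mirroring
the new tests (the array shape and the filtered-out indices are arbitrary
example values, not taken from the patch). Copying each chunk is only strictly
required for the slow, non-sliceable code path, where yielded chunks are
overwritten in place, but it is safe for both paths:

    import numpy as np
    from dclab.rtdc_dataset.export import yield_filtered_array_stacks

    data = np.random.random((547, 80, 320))  # unfiltered feature data
    # events that pass the filter (all except events 10 and 412)
    indices = np.setdiff1d(np.arange(547), [10, 412])

    # copy each yielded chunk before retaining it; the fast (array) path
    # yields fresh arrays, but copying keeps the loop path-agnostic
    stacked = [np.array(chunk, copy=True)
               for chunk in yield_filtered_array_stacks(data, indices)]
    filtered = np.concatenate(stacked, axis=0)
    assert np.all(filtered == data[indices])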