Skip to content

Commit

Permalink
Merge remote-tracking branch 'upsteam/RELEASE_next_patch' into RELEAS…
Browse files Browse the repository at this point in the history
…E_next_minor
  • Loading branch information
ericpre committed Sep 13, 2024
2 parents fcba3b4 + a900179 commit 5a310b5
Show file tree
Hide file tree
Showing 8 changed files with 211 additions and 144 deletions.
18 changes: 9 additions & 9 deletions hyperspy/_signals/lazy.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
_requires_linear_rebin,
get_signal_chunk_slice,
)
from hyperspy.misc.hist_tools import histogram_dask
from hyperspy.misc.hist_tools import _set_histogram_metadata, histogram_dask
from hyperspy.misc.machine_learning import import_sklearn
from hyperspy.misc.utils import dummy_context_manager, isiterable, multiply
from hyperspy.signal import BaseSignal
Expand Down Expand Up @@ -724,14 +724,13 @@ def valuemin(self, axis, out=None, rechunk=False):

valuemin.__doc__ = BaseSignal.valuemin.__doc__

def get_histogram(self, bins="fd", out=None, rechunk=False, **kwargs):
if "range_bins" in kwargs:
_logger.warning("'range_bins' argument not supported for lazy " "signals")
del kwargs["range_bins"]
def get_histogram(
self, bins="fd", range_bins=None, out=None, rechunk=False, **kwargs
):
from hyperspy.signals import Signal1D

data = self._lazy_data(rechunk=rechunk).flatten()
hist, bin_edges = histogram_dask(data, bins=bins, **kwargs)
hist, bin_edges = histogram_dask(data, bins=bins, range=range_bins, **kwargs)
if out is None:
hist_spec = Signal1D(hist)
hist_spec._lazy = True
Expand All @@ -741,12 +740,13 @@ def get_histogram(self, bins="fd", out=None, rechunk=False, **kwargs):
# we always overwrite the data because the computation is lazy ->
# the result signal is lazy. Assume that the `out` is already lazy
hist_spec.data = hist

hist_spec.axes_manager[0].scale = bin_edges[1] - bin_edges[0]
hist_spec.axes_manager[0].offset = bin_edges[0]
hist_spec.axes_manager[0].size = hist.shape[-1]
hist_spec.axes_manager[0].name = "value"
hist_spec.axes_manager[0].is_binned = True
hist_spec.metadata.General.title = self.metadata.General.title + " histogram"

_set_histogram_metadata(self, hist_spec, **kwargs)

if out is None:
return hist_spec
else:
Expand Down
10 changes: 8 additions & 2 deletions hyperspy/_signals/signal1d.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,8 +320,14 @@ def _spikes_diagnosis(
# arbitrary cutoff for number of spectra necessary before histogram
# data is compressed by finding maxima of each spectrum
tmp = BaseSignal(der) if n < 2000 else BaseSignal(np.ravel(der.max(-1)))

s_ = tmp.get_histogram(**kwargs)
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
category=Warning,
message="Estimated number of bins using",
module="hyperspy",
)
s_ = tmp.get_histogram(**kwargs)
s_.axes_manager[0].name = "Derivative magnitude"
s_.metadata.Signal.quantity = "Counts"
s_.metadata.General.title = "Spikes Analysis"
Expand Down
154 changes: 76 additions & 78 deletions hyperspy/docstrings/signal.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,96 +51,94 @@
the chunks are optimised for the new axes configuration."""

RECHUNK_ARG = """rechunk : bool
Only has effect when operating on lazy signal. Default ``False``,
which means the chunking structure will be retained. If ``True``,
the data may be automatically rechunked before performing this
operation."""
Only has effect when operating on lazy signal. Default ``False``,
which means the chunking structure will be retained. If ``True``,
the data may be automatically rechunked before performing this
operation."""

SHOW_PROGRESSBAR_ARG = """show_progressbar : None or bool
If ``True``, display a progress bar. If ``None``, the default from
the preferences settings is used."""
If ``True``, display a progress bar. If ``None``, the default from
the preferences settings is used."""

LAZY_OUTPUT_ARG = """lazy_output : None or bool
If ``True``, the output will be returned as a lazy signal. This means
the calculation itself will be delayed until either compute() is used,
or the signal is stored as a file.
If ``False``, the output will be returned as a non-lazy signal, this
means the outputs will be calculated directly, and loaded into memory.
If ``None`` the output will be lazy if the input signal is lazy, and
non-lazy if the input signal is non-lazy."""
If ``True``, the output will be returned as a lazy signal. This means
the calculation itself will be delayed until either compute() is used,
or the signal is stored as a file.
If ``False``, the output will be returned as a non-lazy signal, this
means the outputs will be calculated directly, and loaded into memory.
If ``None`` the output will be lazy if the input signal is lazy, and
non-lazy if the input signal is non-lazy."""

NUM_WORKERS_ARG = """num_workers : None or int
Number of workers used by dask. If None, defaults
to the dask default value."""
Number of workers used by dask. If None, defaults
to the dask default value."""

CLUSTER_SIGNALS_ARG = """signal : {"mean", "sum", "centroid"}, optional
If "mean" or "sum" return the mean signal or sum respectively
over each cluster. If "centroid", returns the signals closest
to the centroid."""
If "mean" or "sum" return the mean signal or sum respectively
over each cluster. If "centroid", returns the signals closest
to the centroid."""

HISTOGRAM_BIN_ARGS = """bins : int or sequence of float or str, default "fd"
If ``bins`` is an int, it defines the number of equal-width
bins in the given range. If ``bins`` is a
sequence, it defines the bin edges, including the rightmost
edge, allowing for non-uniform bin widths.
If ``bins`` is a string from the list below, will use
the method chosen to calculate the optimal bin width and
consequently the number of bins (see Notes for more detail on
the estimators) from the data that falls within the requested
range. While the bin width will be optimal for the actual data
in the range, the number of bins will be computed to fill the
entire range, including the empty portions. For visualisation,
using the ``'auto'`` option is suggested. Weighted data is not
supported for automated bin size selection.
'auto'
Maximum of the 'sturges' and 'fd' estimators. Provides good
all around performance.
'fd' (Freedman Diaconis Estimator)
Robust (resilient to outliers) estimator that takes into
account data variability and data size.
'doane'
An improved version of Sturges' estimator that works better
with non-normal datasets.
'scott'
Less robust estimator that takes into account data
variability and data size.
'stone'
Estimator based on leave-one-out cross-validation estimate of
the integrated squared error. Can be regarded as a generalization
of Scott's rule.
'rice'
Estimator does not take variability into account, only data
size. Commonly overestimates number of bins required.
'sturges'
R's default method, only accounts for data size. Only
optimal for gaussian data and underestimates number of bins
for large non-gaussian datasets.
'sqrt'
Square root (of data size) estimator, used by Excel and
other programs for its speed and simplicity.
'knuth'
Knuth's rule is a fixed-width, Bayesian approach to determining
the optimal bin width of a histogram.
'blocks'
Determination of optimal adaptive-width histogram bins using
the Bayesian Blocks algorithm.
"""
If ``bins`` is an int, it defines the number of equal-width
bins in the given range. If ``bins`` is a
sequence, it defines the bin edges, including the rightmost
edge, allowing for non-uniform bin widths.
If ``bins`` is a string from the list below, will use
the method chosen to calculate the optimal bin width and
consequently the number of bins (see Notes for more detail on
the estimators) from the data that falls within the requested
range. While the bin width will be optimal for the actual data
in the range, the number of bins will be computed to fill the
entire range, including the empty portions. For visualisation,
using the ``'auto'`` option is suggested. Weighted data is not
supported for automated bin size selection.
Possible strings are:
- ``'auto'`` : Maximum of the 'sturges' and 'fd' estimators.
Provides good all around performance.
- ``'fd'`` : Freedman Diaconis Estimator, robust
(resilient to outliers) estimator that takes into
account data variability and data size.
- ``'doane'`` : An improved version of Sturges' estimator
that works better with non-normal datasets.
- ``'scott'`` : Less robust estimator that takes into
account data variability and data size.
- ``'stone'`` : Estimator based on leave-one-out cross-validation
estimate of the integrated squared error. Can be regarded
as a generalization of Scott's rule.
- ``'rice'`` : Estimator does not take variability into account,
only data size. Commonly overestimates number of bins required.
- ``'sturges'`` : R's default method, only accounts for data size.
Only optimal for gaussian data and underestimates number
of bins for large non-gaussian datasets.
- ``'sqrt'`` : Square root (of data size) estimator, used by Excel
and other programs for its speed and simplicity.
- ``'knuth'`` : Knuth's rule is a fixed-width, Bayesian approach to
determining the optimal bin width of a histogram.
- ``'blocks'`` : Determination of optimal adaptive-width histogram
bins using the Bayesian Blocks algorithm."""

HISTOGRAM_MAX_BIN_ARGS = """max_num_bins : int, default 250
When estimating the bins using one of the str methods, the
number of bins is capped by this number to avoid a MemoryError
being raised by :func:`numpy.histogram`."""
When estimating the bins using one of the str methods, the
number of bins is capped by this number to avoid a MemoryError
being raised by :func:`numpy.histogram`."""

HISTOGRAM_RANGE_ARGS = """range_bins : (float, float), optional
The lower and upper limit of the range of bins. If not provided,
range is simply ``(a.min(), a.max())``. Values outside the range are
ignored. The first element of the range must be less than or
equal to the second. ``range_bins`` affects the automatic bin
computation as well. While bin width is computed to be optimal
based on the actual data within ``range_bins``, the bin count will fill
the entire range including portions containing no data."""

HISTOGRAM_WEIGHTS_ARGS = """weights : array_like, optional
An array of weights, of the same shape as `a`. Each value in
`a` only contributes its associated weight towards the bin count
(instead of 1). This is currently not used by any of the bin estimators,
but may be in the future."""

SIGNAL_MASK_ARG = """signal_mask : numpy.ndarray of bool
Restricts the operation to the signal locations not marked
Expand Down
16 changes: 6 additions & 10 deletions hyperspy/drawing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import hyperspy
import hyperspy.api as hs
from hyperspy.defaults_parser import preferences
from hyperspy.docstrings.signal import HISTOGRAM_BIN_ARGS, HISTOGRAM_RANGE_ARGS
from hyperspy.misc.utils import to_numpy

_logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -1753,16 +1754,8 @@ def plot_histograms(
signal_list : iterable
Ordered list of spectra to plot. If ``style`` is ``"cascade"`` or
``"mosaic"``, the spectra can have different size and axes.
bins : int, list or str, optional
If bins is a string, then it must be one of:
- ``'knuth'`` : use Knuth's rule to determine bins,
- ``'scott'`` : use Scott's rule to determine bins,
- ``'fd'`` : use the Freedman-Diaconis rule to determine bins,
- ``'blocks'`` : use Bayesian blocks for dynamic bin widths.
range_bins : None or tuple, optional
The minimum and maximum range for the histogram. If not specified,
it will be (``x.min()``, ``x.max()``).
%s
%s
color : None, (list of) matplotlib color, optional
Sets the color of the lines of the plots. For a list, if its length is
less than the number of spectra to plot, the colors will be cycled.
Expand Down Expand Up @@ -1813,6 +1806,9 @@ def plot_histograms(
)


plot_histograms.__doc__ %= (HISTOGRAM_BIN_ARGS, HISTOGRAM_RANGE_ARGS)


def picker_kwargs(value, kwargs=None):
if kwargs is None:
kwargs = {}
Expand Down
Loading

0 comments on commit 5a310b5

Please sign in to comment.