From 684a1a3056a9aac89b7438fe41fcd81d1a1b2158 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sat, 22 Feb 2025 13:51:16 -0500 Subject: [PATCH] Backport PR #60938: ENH(string dtype): Implement cumsum for Python-backed strings (#60984) * ENH: Improved error message and raise new error for small-string NaN edge case in HDFStore.append (#60829) * Add clearer error messages for datatype mismatch in HDFStore.append. Raise ValueError when nan_rep too large for pytable column. Add and modify applicable test code. * Fix missed tests and correct mistake in error message. * Remove excess comments. Reverse error type change to avoid api changes. Move nan_rep tests into separate function. (cherry picked from commit 57340ecd08580f26ee4a976c1f68b2f563c41569) * TST(string dtype): Resolve xfails in pytables (#60795) (cherry picked from commit 4511251ccf409f2ba71cab0283bdf751697ee539) * BUG(string dtype): Resolve pytables xfail when reading with condition (#60943) (cherry picked from commit 0ec5f2668e9568d90595180d5ee925305ec7182e) * Backport PR #60940: ENH: Add dtype argument to str.decode * Backport PR #60938: ENH(string dtype): Implement cumsum for Python-backed strings --------- Co-authored-by: Jake Thomas Trevallion <136272202+JakeTT404@users.noreply.github.com> --- doc/source/whatsnew/v2.3.0.rst | 2 +- pandas/core/arrays/string_.py | 83 ++++++++++++++++++++++++++ pandas/tests/apply/test_str.py | 15 +---- pandas/tests/extension/test_string.py | 6 +- pandas/tests/series/test_cumulative.py | 11 ++-- 5 files changed, 92 insertions(+), 25 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index c4e01a86ce843..db3dcb50bacd0 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -38,7 +38,7 @@ Other enhancements - :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`) - :meth:`~Series.to_hdf` and 
:meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`) - The :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`60940`) -- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`) +- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns (:issue:`60633`) - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 3efb48c86e92c..c1048e806ff9a 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -46,6 +46,7 @@ ) from pandas.core import ( + missing, nanops, ops, ) @@ -865,6 +866,88 @@ def _reduce( return result raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> StringArray: + """ + Return an ExtensionArray performing an accumulation operation. + + The underlying data type might change. + + Parameters + ---------- + name : str + Name of the function, supported values are: + - cummin + - cummax + - cumsum + - cumprod + skipna : bool, default True + If True, skip NA values. + **kwargs + Additional keyword arguments passed to the accumulation function. + Currently, there is no supported kwarg. 
+ + Returns + ------- + array + + Raises + ------ + TypeError : if ``name`` is "cumprod", which is not supported for string dtype + """ + if name == "cumprod": + msg = f"operation '{name}' not supported for dtype '{self.dtype}'" + raise TypeError(msg) + + # We may need to strip out trailing NA values + tail: np.ndarray | None = None + na_mask: np.ndarray | None = None + ndarray = self._ndarray + np_func = { + "cumsum": np.cumsum, + "cummin": np.minimum.accumulate, + "cummax": np.maximum.accumulate, + }[name] + + if self._hasna: + na_mask = cast("npt.NDArray[np.bool_]", isna(ndarray)) + if np.all(na_mask): + return type(self)(ndarray) + if skipna: + if name == "cumsum": + ndarray = np.where(na_mask, "", ndarray) + else: + # We can retain the running min/max by forward/backward filling. + ndarray = ndarray.copy() + missing.pad_or_backfill_inplace( + ndarray, + method="pad", + axis=0, + ) + missing.pad_or_backfill_inplace( + ndarray, + method="backfill", + axis=0, + ) + else: + # When not skipping NA values, the result should be null from + # the first NA value onward. 
+ idx = np.argmax(na_mask) + tail = np.empty(len(ndarray) - idx, dtype="object") + tail[:] = self.dtype.na_value + ndarray = ndarray[:idx] + + # mypy: Cannot call function of unknown type + np_result = np_func(ndarray) # type: ignore[operator] + + if tail is not None: + np_result = np.hstack((np_result, tail)) + elif na_mask is not None: + # Argument 2 to "where" has incompatible type "NAType | float" + np_result = np.where(na_mask, self.dtype.na_value, np_result) # type: ignore[arg-type] + + result = type(self)(np_result) + return result + def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any: if self.dtype.na_value is np.nan and result is libmissing.NA: # the masked_reductions use pd.NA -> convert to np.nan diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index 9c7836a0aa167..17e8322dc40e1 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas.compat import HAS_PYARROW - from pandas.core.dtypes.common import is_number from pandas import ( @@ -168,21 +166,10 @@ def test_agg_cython_table_series(series, func, expected): ), ), ) -def test_agg_cython_table_transform_series(request, series, func, expected): +def test_agg_cython_table_transform_series(series, func, expected): # GH21224 # test transforming functions in # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) - if ( - series.dtype == "string" - and func in ("cumsum", np.cumsum, np.nancumsum) - and not HAS_PYARROW - ): - request.applymarker( - pytest.mark.xfail( - raises=NotImplementedError, - reason="TODO(infer_string) cumsum not yet implemented for string", - ) - ) warn = None if isinstance(func, str) else FutureWarning with tm.assert_produces_warning(warn, match="is currently using Series.*"): result = series.agg(func) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 301c7ee851aa0..526cf426781ad 100644 --- 
a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -200,11 +200,7 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: assert isinstance(ser.dtype, StorageExtensionDtype) - return ser.dtype.storage == "pyarrow" and op_name in [ - "cummin", - "cummax", - "cumsum", - ] + return op_name in ["cummin", "cummax", "cumsum"] def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): dtype = cast(StringDtype, tm.get_dtype(obj)) diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index 0dc391db2182b..97f5fb4a9f96f 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -193,13 +193,14 @@ def test_cumprod_timedelta(self): ([pd.NA, pd.NA, pd.NA], "cummax", False, [pd.NA, pd.NA, pd.NA]), ], ) - def test_cum_methods_pyarrow_strings( - self, pyarrow_string_dtype, data, op, skipna, expected_data + def test_cum_methods_ea_strings( + self, string_dtype_no_object, data, op, skipna, expected_data ): - # https://github.com/pandas-dev/pandas/pull/60633 - ser = pd.Series(data, dtype=pyarrow_string_dtype) + # https://github.com/pandas-dev/pandas/pull/60633 - pyarrow + # https://github.com/pandas-dev/pandas/pull/60938 - Python + ser = pd.Series(data, dtype=string_dtype_no_object) method = getattr(ser, op) - expected = pd.Series(expected_data, dtype=pyarrow_string_dtype) + expected = pd.Series(expected_data, dtype=string_dtype_no_object) result = method(skipna=skipna) tm.assert_series_equal(result, expected)