diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 09134763977c3..3b4b9904bee5a 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -118,6 +118,8 @@ Conversion Strings ^^^^^^^ +- Bug in :meth:`.DataFrameGroupBy.min`, :meth:`.DataFrameGroupBy.max`, :meth:`.Resampler.min`, :meth:`.Resampler.max` where all NA values of string dtype would return float instead of string dtype (:issue:`60810`) +- Bug in :meth:`DataFrame.sum` with ``axis=1``, :meth:`.DataFrameGroupBy.sum` or :meth:`.SeriesGroupBy.sum` with ``skipna=True``, and :meth:`.Resampler.sum` with all NA values of :class:`StringDtype` resulted in ``0`` instead of the empty string ``""`` (:issue:`60229`) - Bug in :meth:`Series.__pos__` and :meth:`DataFrame.__pos__` did not raise for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`60710`) - Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`) - Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 33745438e2aea..dbf2090e53579 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2628,7 +2628,15 @@ def _groupby_op( if op.how not in ["any", "all"]: # Fail early to avoid conversion to object op._get_cython_function(op.kind, op.how, np.dtype(object), False) - npvalues = self.to_numpy(object, na_value=np.nan) + + arr = self + if op.how == "sum": + # https://github.com/pandas-dev/pandas/issues/60229 + # All NA should result in the empty string. 
+ assert "skipna" in kwargs + if kwargs["skipna"] and min_count == 0: + arr = arr.fillna("") + npvalues = arr.to_numpy(object, na_value=np.nan) else: raise NotImplementedError( f"function is not implemented for this dtype: {self.dtype}" diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d0c0ed29b6d44..6ce95fd4e645a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -81,6 +81,7 @@ class providing the base-class of operations. is_numeric_dtype, is_object_dtype, is_scalar, + is_string_dtype, needs_i8_conversion, pandas_dtype, ) @@ -1725,6 +1726,10 @@ def _agg_py_fallback( if ser.dtype == object: res_values = res_values.astype(object, copy=False) + elif is_string_dtype(ser.dtype) and how in ["min", "max"]: + dtype = ser.dtype + string_array_cls = dtype.construct_array_type() + res_values = string_array_cls._from_sequence(res_values, dtype=dtype) # If we are DataFrameGroupBy and went through a SeriesGroupByPath # then we need to reshape diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 64e686d25faa7..127f0fc50a747 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -835,6 +835,16 @@ def test_axis_1_empty(self, all_reductions, index): expected = Series([], index=index, dtype=expected_dtype) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("min_count", [0, 1]) + def test_axis_1_sum_na(self, string_dtype_no_object, skipna, min_count): + # https://github.com/pandas-dev/pandas/issues/60229 + dtype = string_dtype_no_object + df = DataFrame({"a": [pd.NA]}, dtype=dtype) + result = df.sum(axis=1, skipna=skipna, min_count=min_count) + value = "" if skipna and min_count == 0 else pd.NA + expected = Series([value], dtype=dtype) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)]) @pytest.mark.parametrize("numeric_only", [None, True, False]) def 
test_sum_prod_nanops(self, method, unit, numeric_only): diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index ea876cfdf4933..014558bbf4bba 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -20,6 +20,7 @@ isna, ) import pandas._testing as tm +from pandas.tests.groupby import get_groupby_method_args from pandas.util import _test_decorators as td @@ -955,6 +956,98 @@ def test_min_empty_string_dtype(func, string_dtype_no_object): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("min_count", [0, 1]) +@pytest.mark.parametrize("test_series", [True, False]) +def test_string_dtype_all_na( + string_dtype_no_object, reduction_func, skipna, min_count, test_series +): + # https://github.com/pandas-dev/pandas/issues/60985 + if reduction_func == "corrwith": + # corrwith is deprecated. + return + + dtype = string_dtype_no_object + + if reduction_func in [ + "any", + "all", + "idxmin", + "idxmax", + "mean", + "median", + "std", + "var", + ]: + kwargs = {"skipna": skipna} + elif reduction_func in ["kurt"]: + kwargs = {"min_count": min_count} + elif reduction_func in ["count", "nunique", "quantile", "sem", "size"]: + kwargs = {} + else: + kwargs = {"skipna": skipna, "min_count": min_count} + + expected_dtype, expected_value = dtype, pd.NA + if reduction_func in ["all", "any"]: + expected_dtype = "bool" + # TODO: For skipna=False, bool(pd.NA) raises; should groupby? + expected_value = not skipna if reduction_func == "any" else True + elif reduction_func in ["count", "nunique", "size"]: + # TODO: Should be more consistent - return Int64 when dtype.na_value is pd.NA? 
+ if ( + test_series + and reduction_func == "size" + and dtype.storage == "pyarrow" + and dtype.na_value is pd.NA + ): + expected_dtype = "Int64" + else: + expected_dtype = "int64" + expected_value = 1 if reduction_func == "size" else 0 + elif reduction_func in ["idxmin", "idxmax"]: + expected_dtype, expected_value = "float64", np.nan + elif not skipna or min_count > 0: + expected_value = pd.NA + elif reduction_func == "sum": + # https://github.com/pandas-dev/pandas/pull/60936 + expected_value = "" + + df = DataFrame({"a": ["x"], "b": [pd.NA]}, dtype=dtype) + obj = df["b"] if test_series else df + args = get_groupby_method_args(reduction_func, obj) + gb = obj.groupby(df["a"]) + method = getattr(gb, reduction_func) + + if reduction_func in [ + "mean", + "median", + "kurt", + "prod", + "quantile", + "sem", + "skew", + "std", + "var", + ]: + msg = f"dtype '{dtype}' does not support operation '{reduction_func}'" + with pytest.raises(TypeError, match=msg): + method(*args, **kwargs) + return + elif reduction_func in ["idxmin", "idxmax"] and not skipna: + msg = f"{reduction_func} with skipna=False encountered an NA value." 
+ with pytest.raises(ValueError, match=msg): + method(*args, **kwargs) + return + + result = method(*args, **kwargs) + index = pd.Index(["x"], name="a", dtype=dtype) + if test_series or reduction_func == "size": + name = None if not test_series and reduction_func == "size" else "b" + expected = Series(expected_value, index=index, dtype=expected_dtype, name=name) + else: + expected = DataFrame({"b": expected_value}, index=index, dtype=expected_dtype) + tm.assert_equal(result, expected) + + def test_max_nan_bug(): df = DataFrame( { diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index b2d9f6c0e3eb0..0db5c0c82d4d4 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -223,6 +223,31 @@ def test_resample_empty_series(freq, index, resample_method): assert result.index.freq == expected.index.freq +@pytest.mark.parametrize("min_count", [0, 1]) +def test_resample_empty_sum_string(string_dtype_no_object, min_count): + # https://github.com/pandas-dev/pandas/issues/60229 + dtype = string_dtype_no_object + ser = Series( + pd.NA, + index=DatetimeIndex( + [ + "2000-01-01 00:00:00", + "2000-01-01 00:00:10", + "2000-01-01 00:00:20", + "2000-01-01 00:00:30", + ] + ), + dtype=dtype, + ) + rs = ser.resample("20s") + result = rs.sum(min_count=min_count) + + value = "" if min_count == 0 else pd.NA + index = date_range(start="2000-01-01", freq="20s", periods=2, unit="s") + expected = Series(value, index=index, dtype=dtype) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "freq", [ diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index e7850f96b3b0f..286625b8ce470 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import is_platform_windows import 
pandas as pd @@ -462,7 +460,6 @@ def test_empty(keys): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("consolidate", [True, False]) def test_resample_groupby_agg_object_dtype_all_nan(consolidate): # https://github.com/pandas-dev/pandas/issues/39329 @@ -494,6 +491,26 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("min_count", [0, 1]) +def test_groupby_resample_empty_sum_string( + string_dtype_no_object, test_frame, min_count +): + # https://github.com/pandas-dev/pandas/issues/60229 + dtype = string_dtype_no_object + test_frame = test_frame.assign(B=pd.array([pd.NA] * len(test_frame), dtype=dtype)) + gbrs = test_frame.groupby("A").resample("40s") + result = gbrs.sum(min_count=min_count) + + index = pd.MultiIndex( + levels=[[1, 2, 3], [pd.to_datetime("2000-01-01", unit="ns")]], + codes=[[0, 1, 2], [0, 0, 0]], + names=["A", None], + ) + value = "" if min_count == 0 else pd.NA + expected = DataFrame({"B": value}, index=index, dtype=dtype) + tm.assert_frame_equal(result, expected) + + def test_groupby_resample_with_list_of_keys(): # GH 47362 df = DataFrame(