Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG(string dtype): groupby/resampler.min/max returns float on all NA strings #60985

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,8 @@ Conversion

Strings
^^^^^^^
- Bug in :meth:`.DataFrameGroupBy.min`, :meth:`.DataFrameGroupBy.max`, :meth:`.Resampler.min`, :meth:`.Resampler.max` on string input of all NA values would return float dtype; now returns string (:issue:`60810`)
- Bug in :meth:`DataFrame.sum` with ``axis=1``, :meth:`.DataFrameGroupBy.sum` or :meth:`.SeriesGroupBy.sum` with ``skipna=True``, and :meth:`.Resampler.sum` on :class:`StringDtype` with all NA values resulted in ``0`` and is now the empty string ``""`` (:issue:`60229`)
- Bug in :meth:`Series.__pos__` and :meth:`DataFrame.__pos__` did not raise for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`60710`)
- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
- Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`)
Expand Down
10 changes: 9 additions & 1 deletion pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2628,7 +2628,15 @@ def _groupby_op(
if op.how not in ["any", "all"]:
# Fail early to avoid conversion to object
op._get_cython_function(op.kind, op.how, np.dtype(object), False)
npvalues = self.to_numpy(object, na_value=np.nan)

arr = self
if op.how == "sum":
# https://github.com/pandas-dev/pandas/issues/60229
# All NA should result in the empty string.
assert "skipna" in kwargs
if kwargs["skipna"] and min_count == 0:
arr = arr.fillna("")
npvalues = arr.to_numpy(object, na_value=np.nan)
else:
raise NotImplementedError(
f"function is not implemented for this dtype: {self.dtype}"
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ class providing the base-class of operations.
is_numeric_dtype,
is_object_dtype,
is_scalar,
is_string_dtype,
needs_i8_conversion,
pandas_dtype,
)
Expand Down Expand Up @@ -1725,6 +1726,10 @@ def _agg_py_fallback(

if ser.dtype == object:
res_values = res_values.astype(object, copy=False)
elif is_string_dtype(ser.dtype) and how in ["min", "max"]:
dtype = ser.dtype
string_array_cls = dtype.construct_array_type()
res_values = string_array_cls._from_sequence(res_values, dtype=dtype)

# If we are DataFrameGroupBy and went through a SeriesGroupByPath
# then we need to reshape
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/frame/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -835,6 +835,16 @@ def test_axis_1_empty(self, all_reductions, index):
expected = Series([], index=index, dtype=expected_dtype)
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize("min_count", [0, 1])
def test_axis_1_sum_na(self, string_dtype_no_object, skipna, min_count):
# https://github.com/pandas-dev/pandas/issues/60229
dtype = string_dtype_no_object
df = DataFrame({"a": [pd.NA]}, dtype=dtype)
result = df.sum(axis=1, skipna=skipna, min_count=min_count)
value = "" if skipna and min_count == 0 else pd.NA
expected = Series([value], dtype=dtype)
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)])
@pytest.mark.parametrize("numeric_only", [None, True, False])
def test_sum_prod_nanops(self, method, unit, numeric_only):
Expand Down
93 changes: 93 additions & 0 deletions pandas/tests/groupby/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
isna,
)
import pandas._testing as tm
from pandas.tests.groupby import get_groupby_method_args
from pandas.util import _test_decorators as td


Expand Down Expand Up @@ -955,6 +956,98 @@ def test_min_empty_string_dtype(func, string_dtype_no_object):
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("min_count", [0, 1])
@pytest.mark.parametrize("test_series", [True, False])
def test_string_dtype_all_na(
string_dtype_no_object, reduction_func, skipna, min_count, test_series
):
# https://github.com/pandas-dev/pandas/issues/60985
if reduction_func == "corrwith":
# corrwith is deprecated.
return

dtype = string_dtype_no_object

if reduction_func in [
"any",
"all",
"idxmin",
"idxmax",
"mean",
"median",
"std",
"var",
]:
kwargs = {"skipna": skipna}
elif reduction_func in ["kurt"]:
kwargs = {"min_count": min_count}
elif reduction_func in ["count", "nunique", "quantile", "sem", "size"]:
kwargs = {}
else:
kwargs = {"skipna": skipna, "min_count": min_count}

expected_dtype, expected_value = dtype, pd.NA
if reduction_func in ["all", "any"]:
expected_dtype = "bool"
# TODO: For skipna=False, bool(pd.NA) raises; should groupby?
expected_value = not skipna if reduction_func == "any" else True
elif reduction_func in ["count", "nunique", "size"]:
# TODO: Should be more consistent - return Int64 when dtype.na_value is pd.NA?
if (
test_series
and reduction_func == "size"
and dtype.storage == "pyarrow"
and dtype.na_value is pd.NA
):
expected_dtype = "Int64"
else:
expected_dtype = "int64"
expected_value = 1 if reduction_func == "size" else 0
elif reduction_func in ["idxmin", "idxmax"]:
expected_dtype, expected_value = "float64", np.nan
elif not skipna or min_count > 0:
expected_value = pd.NA
elif reduction_func == "sum":
# https://github.com/pandas-dev/pandas/pull/60936
expected_value = ""

df = DataFrame({"a": ["x"], "b": [pd.NA]}, dtype=dtype)
obj = df["b"] if test_series else df
args = get_groupby_method_args(reduction_func, obj)
gb = obj.groupby(df["a"])
method = getattr(gb, reduction_func)

if reduction_func in [
"mean",
"median",
"kurt",
"prod",
"quantile",
"sem",
"skew",
"std",
"var",
]:
msg = f"dtype '{dtype}' does not support operation '{reduction_func}'"
with pytest.raises(TypeError, match=msg):
method(*args, **kwargs)
return
elif reduction_func in ["idxmin", "idxmax"] and not skipna:
msg = f"{reduction_func} with skipna=False encountered an NA value."
with pytest.raises(ValueError, match=msg):
method(*args, **kwargs)
return

result = method(*args, **kwargs)
index = pd.Index(["x"], name="a", dtype=dtype)
if test_series or reduction_func == "size":
name = None if not test_series and reduction_func == "size" else "b"
expected = Series(expected_value, index=index, dtype=expected_dtype, name=name)
else:
expected = DataFrame({"b": expected_value}, index=index, dtype=expected_dtype)
tm.assert_equal(result, expected)


def test_max_nan_bug():
df = DataFrame(
{
Expand Down
25 changes: 25 additions & 0 deletions pandas/tests/resample/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,31 @@ def test_resample_empty_series(freq, index, resample_method):
assert result.index.freq == expected.index.freq


@pytest.mark.parametrize("min_count", [0, 1])
def test_resample_empty_sum_string(string_dtype_no_object, min_count):
# https://github.com/pandas-dev/pandas/issues/60229
dtype = string_dtype_no_object
ser = Series(
pd.NA,
index=DatetimeIndex(
[
"2000-01-01 00:00:00",
"2000-01-01 00:00:10",
"2000-01-01 00:00:20",
"2000-01-01 00:00:30",
]
),
dtype=dtype,
)
rs = ser.resample("20s")
result = rs.sum(min_count=min_count)

value = "" if min_count == 0 else pd.NA
index = date_range(start="2000-01-01", freq="20s", periods=2, unit="s")
expected = Series(value, index=index, dtype=dtype)
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
"freq",
[
Expand Down
23 changes: 20 additions & 3 deletions pandas/tests/resample/test_resampler_grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat import is_platform_windows

import pandas as pd
Expand Down Expand Up @@ -462,7 +460,6 @@ def test_empty(keys):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.parametrize("consolidate", [True, False])
def test_resample_groupby_agg_object_dtype_all_nan(consolidate):
# https://github.com/pandas-dev/pandas/issues/39329
Expand Down Expand Up @@ -494,6 +491,26 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate):
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("min_count", [0, 1])
def test_groupby_resample_empty_sum_string(
string_dtype_no_object, test_frame, min_count
):
# https://github.com/pandas-dev/pandas/issues/60229
dtype = string_dtype_no_object
test_frame = test_frame.assign(B=pd.array([pd.NA] * len(test_frame), dtype=dtype))
gbrs = test_frame.groupby("A").resample("40s")
result = gbrs.sum(min_count=min_count)

index = pd.MultiIndex(
levels=[[1, 2, 3], [pd.to_datetime("2000-01-01", unit="ns")]],
codes=[[0, 1, 2], [0, 0, 0]],
names=["A", None],
)
value = "" if min_count == 0 else pd.NA
expected = DataFrame({"B": value}, index=index, dtype=dtype)
tm.assert_frame_equal(result, expected)


def test_groupby_resample_with_list_of_keys():
# GH 47362
df = DataFrame(
Expand Down
Loading