From 9b03dd4d22550403b75d74f8b54b422bd31c55f2 Mon Sep 17 00:00:00 2001 From: 3w36zj6 <52315048+3w36zj6@users.noreply.github.com> Date: Sun, 2 Feb 2025 04:16:46 +0900 Subject: [PATCH 01/68] ENH: Add `Styler.to_typst()` (#60733) * ENH: Add `to_typst` method to `Styler` * TST: Add `Styler.to_typst()` test cases * STY: Apply Ruff suggestions * DOC: Update What's new * DOC: Update reference * CI: Add `Styler.template_typst` to validation ignore list * DOC: Update docstring format for `Styler.to_typst()` example * DOC: Update versionadded for `Styler.to_typst()` to 3.0.0 in documentation --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/reference/style.rst | 2 + doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/formats/style.py | 105 ++++++++++++++++++ pandas/io/formats/style_render.py | 16 +++ pandas/io/formats/templates/typst.tpl | 12 ++ .../tests/io/formats/style/test_to_typst.py | 96 ++++++++++++++++ scripts/validate_docstrings.py | 1 + 7 files changed, 233 insertions(+) create mode 100644 pandas/io/formats/templates/typst.tpl create mode 100644 pandas/tests/io/formats/style/test_to_typst.py diff --git a/doc/source/reference/style.rst b/doc/source/reference/style.rst index 0e1d93841d52f..742263c788c2f 100644 --- a/doc/source/reference/style.rst +++ b/doc/source/reference/style.rst @@ -27,6 +27,7 @@ Styler properties Styler.template_html_style Styler.template_html_table Styler.template_latex + Styler.template_typst Styler.template_string Styler.loader @@ -77,6 +78,7 @@ Style export and import Styler.to_html Styler.to_latex + Styler.to_typst Styler.to_excel Styler.to_string Styler.export diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a7f63d75a047e..64f4a66a109f5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -31,6 +31,7 @@ Other enhancements - :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`) - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`) - :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface `_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`) +- Added :meth:`.Styler.to_typst` to write Styler objects to file, buffer or string in Typst format (:issue:`57617`) - :class:`pandas.api.typing.NoDefault` is available for typing ``no_default`` - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) - :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 6f164c4b97514..3f37556867954 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -1228,6 +1228,111 @@ def to_latex( ) return save_to_buffer(latex, buf=buf, encoding=encoding) + @overload + def to_typst( + self, + buf: FilePath | WriteBuffer[str], + *, + encoding: str | None = ..., + sparse_index: bool | None = ..., + sparse_columns: bool | None = ..., + max_rows: int | None = ..., + max_columns: int | None = ..., + ) -> None: ... 
+ + @overload + def to_typst( + self, + buf: None = ..., + *, + encoding: str | None = ..., + sparse_index: bool | None = ..., + sparse_columns: bool | None = ..., + max_rows: int | None = ..., + max_columns: int | None = ..., + ) -> str: ... + + @Substitution(buf=buffering_args, encoding=encoding_args) + def to_typst( + self, + buf: FilePath | WriteBuffer[str] | None = None, + *, + encoding: str | None = None, + sparse_index: bool | None = None, + sparse_columns: bool | None = None, + max_rows: int | None = None, + max_columns: int | None = None, + ) -> str | None: + """ + Write Styler to a file, buffer or string in Typst format. + + .. versionadded:: 3.0.0 + + Parameters + ---------- + %(buf)s + %(encoding)s + sparse_index : bool, optional + Whether to sparsify the display of a hierarchical index. Setting to False + will display each explicit level element in a hierarchical key for each row. + Defaults to ``pandas.options.styler.sparse.index`` value. + sparse_columns : bool, optional + Whether to sparsify the display of a hierarchical index. Setting to False + will display each explicit level element in a hierarchical key for each + column. Defaults to ``pandas.options.styler.sparse.columns`` value. + max_rows : int, optional + The maximum number of rows that will be rendered. Defaults to + ``pandas.options.styler.render.max_rows``, which is None. + max_columns : int, optional + The maximum number of columns that will be rendered. Defaults to + ``pandas.options.styler.render.max_columns``, which is None. + + Rows and columns may be reduced if the number of total elements is + large. This value is set to ``pandas.options.styler.render.max_elements``, + which is 262144 (18 bit browser rendering). + + Returns + ------- + str or None + If `buf` is None, returns the result as a string. Otherwise returns `None`. + + See Also + -------- + DataFrame.to_typst : Write a DataFrame to a file, + buffer or string in Typst format. + + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) + >>> df.style.to_typst() # doctest: +SKIP + + .. 
code-block:: typst + + #table( + columns: 3, + [], [A], [B], + + [0], [1], [3], + [1], [2], [4], + ) + """ + obj = self._copy(deepcopy=True) + + if sparse_index is None: + sparse_index = get_option("styler.sparse.index") + if sparse_columns is None: + sparse_columns = get_option("styler.sparse.columns") + + text = obj._render_typst( + sparse_columns=sparse_columns, + sparse_index=sparse_index, + max_rows=max_rows, + max_cols=max_columns, + ) + return save_to_buffer( + text, buf=buf, encoding=(encoding if buf is not None else None) + ) + @overload def to_html( self, diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index c0f0608f1ab32..2d1218b007d19 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -77,6 +77,7 @@ class StylerRenderer: template_html_table = env.get_template("html_table.tpl") template_html_style = env.get_template("html_style.tpl") template_latex = env.get_template("latex.tpl") + template_typst = env.get_template("typst.tpl") template_string = env.get_template("string.tpl") def __init__( @@ -232,6 +233,21 @@ def _render_latex( d.update(kwargs) return self.template_latex.render(**d) + def _render_typst( + self, + sparse_index: bool, + sparse_columns: bool, + max_rows: int | None = None, + max_cols: int | None = None, + **kwargs, + ) -> str: + """ + Render a Styler in typst format + """ + d = self._render(sparse_index, sparse_columns, max_rows, max_cols) + d.update(kwargs) + return self.template_typst.render(**d) + def _render_string( self, sparse_index: bool, diff --git a/pandas/io/formats/templates/typst.tpl b/pandas/io/formats/templates/typst.tpl new file mode 100644 index 0000000000000..66de8f31b405e --- /dev/null +++ b/pandas/io/formats/templates/typst.tpl @@ -0,0 +1,12 @@ +#table( + columns: {{ head[0] | length }}, +{% for r in head %} + {% for c in r %}[{% if c["is_visible"] %}{{ c["display_value"] }}{% endif %}],{% if not loop.last %} {% endif%}{% endfor %} + +{% endfor %} + +{% for r in body %} + {% for c in r %}[{% if c["is_visible"] %}{{ c["display_value"] }}{% endif %}],{% if not loop.last %} {% endif%}{% endfor %} + +{% endfor %} +) diff --git a/pandas/tests/io/formats/style/test_to_typst.py b/pandas/tests/io/formats/style/test_to_typst.py new file mode 100644 index 0000000000000..2365119c9c4dc --- /dev/null +++ b/pandas/tests/io/formats/style/test_to_typst.py @@ -0,0 +1,96 @@ +from textwrap import dedent + +import pytest + +from pandas import ( + DataFrame, + Series, +) + +pytest.importorskip("jinja2") +from pandas.io.formats.style import Styler + + +@pytest.fixture +def df(): + return DataFrame( + {"A": [0, 1], "B": [-0.61, -1.22], "C": Series(["ab", "cd"], dtype=object)} + ) + + +@pytest.fixture +def styler(df): + return Styler(df, uuid_len=0, precision=2) + + +def test_basic_table(styler): + result = styler.to_typst() + expected = dedent( + """\ + #table( + columns: 4, + [], [A], [B], [C], + + [0], [0], [-0.61], [ab], + [1], [1], [-1.22], [cd], + )""" + ) + assert result == expected + + +def test_concat(styler): + result = styler.concat(styler.data.agg(["sum"]).style).to_typst() + expected = dedent( + """\ + #table( + columns: 4, + [], [A], [B], [C], + + [0], [0], [-0.61], [ab], + [1], [1], [-1.22], [cd], + [sum], [1], [-1.830000], [abcd], + )""" + ) + assert result == expected + + +def test_concat_recursion(styler): + df = styler.data + styler1 = styler + styler2 = Styler(df.agg(["sum"]), uuid_len=0, precision=3) + styler3 = Styler(df.agg(["sum"]), uuid_len=0, precision=4) + result = 
styler1.concat(styler2.concat(styler3)).to_typst() + expected = dedent( + """\ + #table( + columns: 4, + [], [A], [B], [C], + + [0], [0], [-0.61], [ab], + [1], [1], [-1.22], [cd], + [sum], [1], [-1.830], [abcd], + [sum], [1], [-1.8300], [abcd], + )""" + ) + assert result == expected + + +def test_concat_chain(styler): + df = styler.data + styler1 = styler + styler2 = Styler(df.agg(["sum"]), uuid_len=0, precision=3) + styler3 = Styler(df.agg(["sum"]), uuid_len=0, precision=4) + result = styler1.concat(styler2).concat(styler3).to_typst() + expected = dedent( + """\ + #table( + columns: 4, + [], [A], [B], [C], + + [0], [0], [-0.61], [ab], + [1], [1], [-1.22], [cd], + [sum], [1], [-1.830], [abcd], + [sum], [1], [-1.8300], [abcd], + )""" + ) + assert result == expected diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 55acfaac4d843..944575dcc8659 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -45,6 +45,7 @@ "Styler.template_html_style", "Styler.template_html_table", "Styler.template_latex", + "Styler.template_typst", "Styler.template_string", "Styler.loader", "errors.InvalidComparison", From d72f165eb327898b1597efe75ff8b54032c3ae7b Mon Sep 17 00:00:00 2001 From: "Christine P. Chai" Date: Sat, 1 Feb 2025 11:18:25 -0800 Subject: [PATCH 02/68] DOC: Move NumPy Byte Order page in gotchas.rst (#60822) --- doc/source/user_guide/gotchas.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst index 842f30f06676e..e85eead4e0f09 100644 --- a/doc/source/user_guide/gotchas.rst +++ b/doc/source/user_guide/gotchas.rst @@ -372,5 +372,5 @@ constructors using something similar to the following: s = pd.Series(newx) See `the NumPy documentation on byte order -`__ for more +`__ for more details. 
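A quick way to exercise the new method from PATCH 01 above is a doctest-style sketch; this assumes a build of this branch with jinja2 installed, and the expected output simply mirrors the docstring and test cases:

    >>> import pandas as pd
    >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
    >>> print(df.style.to_typst())  # buf=None, so the Typst markup comes back as a string
    #table(
      columns: 3,
      [], [A], [B],

      [0], [1], [3],
      [1], [2], [4],
    )
    >>> df.style.to_typst("table.typ")  # writing to a path (or buffer) returns None instead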
From f1441b218271178ebe18acecc3657f6549fb6c54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quang=20Nguy=E1=BB=85n?= <30631476+quangngd@users.noreply.github.com> Date: Mon, 3 Feb 2025 03:12:30 +0700 Subject: [PATCH 03/68] CHORE: Enable mistakenly ignored tests (#60827) Enable ignored tests --- pandas/tests/io/formats/test_to_string.py | 25 ++++++++++------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/pandas/tests/io/formats/test_to_string.py b/pandas/tests/io/formats/test_to_string.py index af3cdf2d44af3..1e8598c918efe 100644 --- a/pandas/tests/io/formats/test_to_string.py +++ b/pandas/tests/io/formats/test_to_string.py @@ -132,20 +132,17 @@ def test_to_string_with_formatters_unicode(self): ) assert result == expected - def test_to_string_index_formatter(self): - df = DataFrame([range(5), range(5, 10), range(10, 15)]) - - rs = df.to_string(formatters={"__index__": lambda x: "abc"[x]}) - - xp = dedent( - """\ - 0 1 2 3 4 - a 0 1 2 3 4 - b 5 6 7 8 9 - c 10 11 12 13 14\ - """ - ) - assert rs == xp + def test_to_string_index_formatter(self): + df = DataFrame([range(5), range(5, 10), range(10, 15)]) + rs = df.to_string(formatters={"__index__": lambda x: "abc"[x]}) + xp = dedent( + """\ + 0 1 2 3 4 + a 0 1 2 3 4 + b 5 6 7 8 9 + c 10 11 12 13 14""" + ) + assert rs == xp def test_no_extra_space(self): # GH#52690: Check that no extra space is given From a68048ea026f09fc56e1a9963c489ff0beaae651 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Mon, 3 Feb 2025 06:23:23 -0800 Subject: [PATCH 04/68] ENH: Support skipna parameter in GroupBy min, max, prod, median, var, std and sem methods (#60752) --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/_libs/groupby.pyi | 5 + pandas/_libs/groupby.pyx | 99 ++++++++++--- pandas/core/_numba/kernels/min_max_.py | 8 +- pandas/core/_numba/kernels/var_.py | 7 +- pandas/core/groupby/groupby.py | 76 ++++++++-- pandas/core/resample.py | 98 ++++++++++++- pandas/tests/groupby/aggregate/test_numba.py | 2 +- pandas/tests/groupby/test_api.py | 18 +-- pandas/tests/groupby/test_reductions.py | 141 +++++++++++++++++++ 10 files changed, 405 insertions(+), 51 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 64f4a66a109f5..9089b9cdd2185 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -59,9 +59,9 @@ Other enhancements - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) +- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`) - :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`) - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`) -- :meth:`.DataFrameGroupBy.mean`, :meth:`.DataFrameGroupBy.sum`, :meth:`.SeriesGroupBy.mean` and :meth:`.SeriesGroupBy.sum` now accept ``skipna`` 
parameter (:issue:`15675`) - :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`) - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`) - :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index e3909203d1f5a..163fc23535022 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -13,6 +13,7 @@ def group_median_float64( mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., is_datetimelike: bool = ..., # bint + skipna: bool = ..., ) -> None: ... def group_cumprod( out: np.ndarray, # float64_t[:, ::1] @@ -76,6 +77,7 @@ def group_prod( mask: np.ndarray | None, result_mask: np.ndarray | None = ..., min_count: int = ..., + skipna: bool = ..., ) -> None: ... def group_var( out: np.ndarray, # floating[:, ::1] @@ -88,6 +90,7 @@ def group_var( result_mask: np.ndarray | None = ..., is_datetimelike: bool = ..., name: str = ..., + skipna: bool = ..., ) -> None: ... def group_skew( out: np.ndarray, # float64_t[:, ::1] @@ -183,6 +186,7 @@ def group_max( is_datetimelike: bool = ..., mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., + skipna: bool = ..., ) -> None: ... def group_min( out: np.ndarray, # groupby_t[:, ::1] @@ -193,6 +197,7 @@ def group_min( is_datetimelike: bool = ..., mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., + skipna: bool = ..., ) -> None: ... 
def group_idxmin_idxmax( out: npt.NDArray[np.intp], diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 70af22f514ce0..16a104a46ed3d 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -62,7 +62,12 @@ cdef enum InterpolationEnumType: INTERPOLATION_MIDPOINT -cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept nogil: +cdef float64_t median_linear_mask( + float64_t* a, + int n, + uint8_t* mask, + bint skipna=True +) noexcept nogil: cdef: int i, j, na_count = 0 float64_t* tmp @@ -77,7 +82,7 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept n na_count += 1 if na_count: - if na_count == n: + if na_count == n or not skipna: return NaN tmp = malloc((n - na_count) * sizeof(float64_t)) @@ -104,7 +109,8 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept n cdef float64_t median_linear( float64_t* a, int n, - bint is_datetimelike=False + bint is_datetimelike=False, + bint skipna=True, ) noexcept nogil: cdef: int i, j, na_count = 0 @@ -125,7 +131,7 @@ cdef float64_t median_linear( na_count += 1 if na_count: - if na_count == n: + if na_count == n or not skipna: return NaN tmp = malloc((n - na_count) * sizeof(float64_t)) @@ -186,6 +192,7 @@ def group_median_float64( const uint8_t[:, :] mask=None, uint8_t[:, ::1] result_mask=None, bint is_datetimelike=False, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 @@ -229,7 +236,7 @@ def group_median_float64( for j in range(ngroups): size = _counts[j + 1] - result = median_linear_mask(ptr, size, ptr_mask) + result = median_linear_mask(ptr, size, ptr_mask, skipna) out[j, i] = result if result != result: @@ -244,7 +251,7 @@ def group_median_float64( ptr += _counts[0] for j in range(ngroups): size = _counts[j + 1] - out[j, i] = median_linear(ptr, size, is_datetimelike) + out[j, i] = median_linear(ptr, size, is_datetimelike, skipna) ptr += size @@ -804,17 +811,18 @@ def group_prod( const uint8_t[:, ::1] mask, uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=0, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - int64float_t val + int64float_t val, nan_val int64float_t[:, ::1] prodx int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) - bint isna_entry, uses_mask = mask is not None + bint isna_entry, isna_result, uses_mask = mask is not None if len_values != len_labels: raise ValueError("len(index) != len(labels)") @@ -823,6 +831,7 @@ def group_prod( prodx = np.ones((out).shape, dtype=(out).base.dtype) N, K = (values).shape + nan_val = _get_na_val(0, False) with nogil: for i in range(N): @@ -836,12 +845,23 @@ def group_prod( if uses_mask: isna_entry = mask[i, j] + isna_result = result_mask[lab, j] else: isna_entry = _treat_as_na(val, False) + isna_result = _treat_as_na(prodx[lab, j], False) + + if not skipna and isna_result: + # If prod is already NA, no need to update it + continue if not isna_entry: nobs[lab, j] += 1 prodx[lab, j] *= val + elif not skipna: + if uses_mask: + result_mask[lab, j] = True + else: + prodx[lab, j] = nan_val _check_below_mincount( out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx @@ -862,6 +882,7 @@ def group_var( uint8_t[:, ::1] result_mask=None, bint is_datetimelike=False, str name="var", + bint skipna=True, ) -> None: cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) @@ -869,7 +890,7 @@ def group_var( floating[:, ::1] mean int64_t[:, ::1] nobs Py_ssize_t len_values = 
len(values), len_labels = len(labels) - bint isna_entry, uses_mask = mask is not None + bint isna_entry, isna_result, uses_mask = mask is not None bint is_std = name == "std" bint is_sem = name == "sem" @@ -898,19 +919,34 @@ def group_var( if uses_mask: isna_entry = mask[i, j] + isna_result = result_mask[lab, j] elif is_datetimelike: # With group_var, we cannot just use _treat_as_na bc # datetimelike dtypes get cast to float64 instead of # to int64. isna_entry = val == NPY_NAT + isna_result = out[lab, j] == NPY_NAT else: isna_entry = _treat_as_na(val, is_datetimelike) + isna_result = _treat_as_na(out[lab, j], is_datetimelike) + + if not skipna and isna_result: + # If aggregate is already NA, don't add to it. This is important for + # datetimelike because adding a value to NPY_NAT may not result + # in a NPY_NAT + continue if not isna_entry: nobs[lab, j] += 1 oldmean = mean[lab, j] mean[lab, j] += (val - oldmean) / nobs[lab, j] out[lab, j] += (val - mean[lab, j]) * (val - oldmean) + elif not skipna: + nobs[lab, j] = 0 + if uses_mask: + result_mask[lab, j] = True + else: + out[lab, j] = NAN for i in range(ncounts): for j in range(K): @@ -1164,7 +1200,7 @@ def group_mean( mean_t[:, ::1] sumx, compensation int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) - bint isna_entry, uses_mask = mask is not None + bint isna_entry, isna_result, uses_mask = mask is not None assert min_count == -1, "'min_count' only used in sum and prod" @@ -1194,25 +1230,24 @@ def group_mean( for j in range(K): val = values[i, j] - if not skipna and ( - (uses_mask and result_mask[lab, j]) or - (is_datetimelike and sumx[lab, j] == NPY_NAT) or - _treat_as_na(sumx[lab, j], False) - ): - # If sum is already NA, don't add to it. This is important for - # datetimelike because adding a value to NPY_NAT may not result - # in NPY_NAT - continue - if uses_mask: isna_entry = mask[i, j] + isna_result = result_mask[lab, j] elif is_datetimelike: # With group_mean, we cannot just use _treat_as_na bc # datetimelike dtypes get cast to float64 instead of # to int64. isna_entry = val == NPY_NAT + isna_result = sumx[lab, j] == NPY_NAT else: isna_entry = _treat_as_na(val, is_datetimelike) + isna_result = _treat_as_na(sumx[lab, j], is_datetimelike) + + if not skipna and isna_result: + # If sum is already NA, don't add to it. This is important for + # datetimelike because adding a value to NPY_NAT may not result + # in NPY_NAT + continue if not isna_entry: nobs[lab, j] += 1 @@ -1806,6 +1841,7 @@ cdef group_min_max( bint compute_max=True, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, + bint skipna=True, ): """ Compute minimum/maximum of columns of `values`, in row groups `labels`. @@ -1833,6 +1869,8 @@ cdef group_min_max( result_mask : ndarray[bool, ndim=2], optional If not None, these specify locations in the output that are NA. Modified in-place. + skipna : bool, default True + If True, ignore nans in `values`. 
Notes ----- @@ -1841,17 +1879,18 @@ cdef group_min_max( """ cdef: Py_ssize_t i, j, N, K, lab, ngroups = len(counts) - numeric_t val + numeric_t val, nan_val numeric_t[:, ::1] group_min_or_max int64_t[:, ::1] nobs bint uses_mask = mask is not None - bint isna_entry + bint isna_entry, isna_result if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) + nan_val = _get_na_val(0, is_datetimelike) group_min_or_max = np.empty_like(out) group_min_or_max[:] = _get_min_or_max(0, compute_max, is_datetimelike) @@ -1870,8 +1909,15 @@ cdef group_min_max( if uses_mask: isna_entry = mask[i, j] + isna_result = result_mask[lab, j] else: isna_entry = _treat_as_na(val, is_datetimelike) + isna_result = _treat_as_na(group_min_or_max[lab, j], + is_datetimelike) + + if not skipna and isna_result: + # If current min/max is already NA, it will always be NA + continue if not isna_entry: nobs[lab, j] += 1 @@ -1881,6 +1927,11 @@ cdef group_min_max( else: if val < group_min_or_max[lab, j]: group_min_or_max[lab, j] = val + elif not skipna: + if uses_mask: + result_mask[lab, j] = True + else: + group_min_or_max[lab, j] = nan_val _check_below_mincount( out, uses_mask, result_mask, ngroups, K, nobs, min_count, group_min_or_max @@ -2012,6 +2063,7 @@ def group_max( bint is_datetimelike=False, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, + bint skipna=True, ) -> None: """See group_min_max.__doc__""" group_min_max( @@ -2024,6 +2076,7 @@ def group_max( compute_max=True, mask=mask, result_mask=result_mask, + skipna=skipna, ) @@ -2038,6 +2091,7 @@ def group_min( bint is_datetimelike=False, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, + bint skipna=True, ) -> None: """See group_min_max.__doc__""" group_min_max( @@ -2050,6 +2104,7 @@ def group_min( compute_max=False, mask=mask, result_mask=result_mask, + skipna=skipna, ) diff --git a/pandas/core/_numba/kernels/min_max_.py b/pandas/core/_numba/kernels/min_max_.py index 59d36732ebae6..d56453e4e5abf 100644 --- a/pandas/core/_numba/kernels/min_max_.py +++ b/pandas/core/_numba/kernels/min_max_.py @@ -88,6 +88,7 @@ def grouped_min_max( ngroups: int, min_periods: int, is_max: bool, + skipna: bool = True, ) -> tuple[np.ndarray, list[int]]: N = len(labels) nobs = np.zeros(ngroups, dtype=np.int64) @@ -97,13 +98,16 @@ def grouped_min_max( for i in range(N): lab = labels[i] val = values[i] - if lab < 0: + if lab < 0 or (nobs[lab] >= 1 and np.isnan(output[lab])): continue if values.dtype.kind == "i" or not np.isnan(val): nobs[lab] += 1 else: - # NaN value cannot be a min/max value + if not skipna: + # If skipna is False and we encounter a NaN, + # both min and max of the group will be NaN + output[lab] = np.nan continue if nobs[lab] == 1: diff --git a/pandas/core/_numba/kernels/var_.py b/pandas/core/_numba/kernels/var_.py index 69aec4d6522c4..5d720c877815d 100644 --- a/pandas/core/_numba/kernels/var_.py +++ b/pandas/core/_numba/kernels/var_.py @@ -176,6 +176,7 @@ def grouped_var( ngroups: int, min_periods: int, ddof: int = 1, + skipna: bool = True, ) -> tuple[np.ndarray, list[int]]: N = len(labels) @@ -190,7 +191,11 @@ def grouped_var( lab = labels[i] val = values[i] - if lab < 0: + if lab < 0 or np.isnan(output[lab]): + continue + + if not skipna and np.isnan(val): + output[lab] = np.nan continue mean_x = means[lab] diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f9059e6e8896f..7c3088bea4b76 100644 --- 
a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2248,7 +2248,7 @@ def mean( return result.__finalize__(self.obj, method="groupby") @final - def median(self, numeric_only: bool = False) -> NDFrameT: + def median(self, numeric_only: bool = False, skipna: bool = True) -> NDFrameT: """ Compute median of groups, excluding missing values. @@ -2263,6 +2263,12 @@ def median(self, numeric_only: bool = False) -> NDFrameT: numeric_only no longer accepts ``None`` and defaults to False. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. versionadded:: 3.0.0 + Returns ------- Series or DataFrame @@ -2335,8 +2341,11 @@ def median(self, numeric_only: bool = False) -> NDFrameT: """ result = self._cython_agg_general( "median", - alt=lambda x: Series(x, copy=False).median(numeric_only=numeric_only), + alt=lambda x: Series(x, copy=False).median( + numeric_only=numeric_only, skipna=skipna + ), numeric_only=numeric_only, + skipna=skipna, ) return result.__finalize__(self.obj, method="groupby") @@ -2349,6 +2358,7 @@ def std( engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, numeric_only: bool = False, + skipna: bool = True, ): """ Compute standard deviation of groups, excluding missing values. @@ -2387,6 +2397,12 @@ def std( numeric_only now defaults to ``False``. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. versionadded:: 3.0.0 + Returns ------- Series or DataFrame @@ -2441,14 +2457,16 @@ def std( engine_kwargs, min_periods=0, ddof=ddof, + skipna=skipna, ) ) else: return self._cython_agg_general( "std", - alt=lambda x: Series(x, copy=False).std(ddof=ddof), + alt=lambda x: Series(x, copy=False).std(ddof=ddof, skipna=skipna), numeric_only=numeric_only, ddof=ddof, + skipna=skipna, ) @final @@ -2460,6 +2478,7 @@ def var( engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, numeric_only: bool = False, + skipna: bool = True, ): """ Compute variance of groups, excluding missing values. @@ -2497,6 +2516,12 @@ def var( numeric_only now defaults to ``False``. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. versionadded:: 3.0.0 + Returns ------- Series or DataFrame @@ -2550,13 +2575,15 @@ def var( engine_kwargs, min_periods=0, ddof=ddof, + skipna=skipna, ) else: return self._cython_agg_general( "var", - alt=lambda x: Series(x, copy=False).var(ddof=ddof), + alt=lambda x: Series(x, copy=False).var(ddof=ddof, skipna=skipna), numeric_only=numeric_only, ddof=ddof, + skipna=skipna, ) @final @@ -2686,7 +2713,9 @@ def _value_counts( return result.__finalize__(self.obj, method="value_counts") @final - def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: + def sem( + self, ddof: int = 1, numeric_only: bool = False, skipna: bool = True + ) -> NDFrameT: """ Compute standard error of the mean of groups, excluding missing values. @@ -2706,6 +2735,12 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: numeric_only now defaults to ``False``. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. 
versionadded:: 3.0.0 + Returns ------- Series or DataFrame @@ -2780,9 +2815,10 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: ) return self._cython_agg_general( "sem", - alt=lambda x: Series(x, copy=False).sem(ddof=ddof), + alt=lambda x: Series(x, copy=False).sem(ddof=ddof, skipna=skipna), numeric_only=numeric_only, ddof=ddof, + skipna=skipna, ) @final @@ -2959,7 +2995,9 @@ def sum( return result @final - def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT: + def prod( + self, numeric_only: bool = False, min_count: int = 0, skipna: bool = True + ) -> NDFrameT: """ Compute prod of group values. @@ -2976,6 +3014,12 @@ def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT: The required number of valid values to perform the operation. If fewer than ``min_count`` non-NA values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. versionadded:: 3.0.0 + Returns ------- Series or DataFrame @@ -3024,17 +3068,22 @@ def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT: 2 30 72 """ return self._agg_general( - numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod + numeric_only=numeric_only, + min_count=min_count, + skipna=skipna, + alias="prod", + npfunc=np.prod, ) @final @doc( - _groupby_agg_method_engine_template, + _groupby_agg_method_skipna_engine_template, fname="min", no=False, mc=-1, e=None, ek=None, + s=True, example=dedent( """\ For SeriesGroupBy: @@ -3074,6 +3123,7 @@ def min( self, numeric_only: bool = False, min_count: int = -1, + skipna: bool = True, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): @@ -3086,23 +3136,26 @@ def min( engine_kwargs, min_periods=min_count, is_max=False, + skipna=skipna, ) else: return self._agg_general( numeric_only=numeric_only, min_count=min_count, + skipna=skipna, alias="min", npfunc=np.min, ) @final @doc( - _groupby_agg_method_engine_template, + _groupby_agg_method_skipna_engine_template, fname="max", no=False, mc=-1, e=None, ek=None, + s=True, example=dedent( """\ For SeriesGroupBy: @@ -3142,6 +3195,7 @@ def max( self, numeric_only: bool = False, min_count: int = -1, + skipna: bool = True, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): @@ -3154,11 +3208,13 @@ def max( engine_kwargs, min_periods=min_count, is_max=True, + skipna=skipna, ) else: return self._agg_general( numeric_only=numeric_only, min_count=min_count, + skipna=skipna, alias="max", npfunc=np.max, ) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 4b3b7a72b5a5c..1cfc75ea11725 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1269,8 +1269,53 @@ def last( ) @final - @doc(GroupBy.median) def median(self, numeric_only: bool = False): + """ + Compute median of groups, excluding missing values. + + For multiple groupings, the result index will be a MultiIndex + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionchanged:: 2.0.0 + + numeric_only no longer accepts ``None`` and defaults to False. + + Returns + ------- + Series or DataFrame + Median of values within each group. + + See Also + -------- + Series.groupby : Apply a function groupby to a Series. + DataFrame.groupby : Apply a function groupby to each row or column of a + DataFrame. 
+ + Examples + -------- + + >>> ser = pd.Series( + ... [1, 2, 3, 3, 4, 5], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-10", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-10", + ... "2023-02-15", + ... ] + ... ), + ... ) + >>> ser.resample("MS").median() + 2023-01-01 2.0 + 2023-02-01 4.0 + Freq: MS, dtype: float64 + """ return self._downsample("median", numeric_only=numeric_only) @final @@ -1450,12 +1495,61 @@ def var( return self._downsample("var", ddof=ddof, numeric_only=numeric_only) @final - @doc(GroupBy.sem) def sem( self, ddof: int = 1, numeric_only: bool = False, ): + """ + Compute standard error of the mean of groups, excluding missing values. + + For multiple groupings, the result index will be a MultiIndex. + + Parameters + ---------- + ddof : int, default 1 + Degrees of freedom. + + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + .. versionchanged:: 2.0.0 + + numeric_only now defaults to ``False``. + + Returns + ------- + Series or DataFrame + Standard error of the mean of values within each group. + + See Also + -------- + DataFrame.sem : Return unbiased standard error of the mean over requested axis. + Series.sem : Return unbiased standard error of the mean over requested axis. + + Examples + -------- + + >>> ser = pd.Series( + ... [1, 3, 2, 4, 3, 8], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-10", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-10", + ... "2023-02-15", + ... ] + ... ), + ... ) + >>> ser.resample("MS").sem() + 2023-01-01 0.577350 + 2023-02-01 1.527525 + Freq: MS, dtype: float64 + """ return self._downsample("sem", ddof=ddof, numeric_only=numeric_only) @final diff --git a/pandas/tests/groupby/aggregate/test_numba.py b/pandas/tests/groupby/aggregate/test_numba.py index ca265a1d1108b..0cd8a14d97eb0 100644 --- a/pandas/tests/groupby/aggregate/test_numba.py +++ b/pandas/tests/groupby/aggregate/test_numba.py @@ -186,7 +186,7 @@ def test_multifunc_numba_vs_cython_frame(agg_kwargs): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("func", ["sum", "mean"]) +@pytest.mark.parametrize("func", ["sum", "mean", "var", "std", "min", "max"]) def test_multifunc_numba_vs_cython_frame_noskipna(func): pytest.importorskip("numba") data = DataFrame( diff --git a/pandas/tests/groupby/test_api.py b/pandas/tests/groupby/test_api.py index cc69de2581a79..215e627abb018 100644 --- a/pandas/tests/groupby/test_api.py +++ b/pandas/tests/groupby/test_api.py @@ -174,16 +174,13 @@ def test_frame_consistency(groupby_func): elif groupby_func in ("nunique",): exclude_expected = {"axis"} elif groupby_func in ("max", "min"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} exclude_result = {"min_count", "engine", "engine_kwargs"} - elif groupby_func in ("sum", "mean"): + elif groupby_func in ("sum", "mean", "std", "var"): exclude_expected = {"axis", "kwargs"} exclude_result = {"engine", "engine_kwargs"} - elif groupby_func in ("std", "var"): - exclude_expected = {"axis", "kwargs", "skipna"} - exclude_result = {"engine", "engine_kwargs"} elif groupby_func in ("median", "prod", "sem"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} elif groupby_func in ("bfill", "ffill"): exclude_expected = {"inplace", "axis", "limit_area"} elif groupby_func in ("cummax", "cummin"): @@ -235,16 +232,13 @@ def test_series_consistency(request, groupby_func): if groupby_func in 
("any", "all"): exclude_expected = {"kwargs", "bool_only", "axis"} elif groupby_func in ("max", "min"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} exclude_result = {"min_count", "engine", "engine_kwargs"} - elif groupby_func in ("sum", "mean"): + elif groupby_func in ("sum", "mean", "std", "var"): exclude_expected = {"axis", "kwargs"} exclude_result = {"engine", "engine_kwargs"} - elif groupby_func in ("std", "var"): - exclude_expected = {"axis", "kwargs", "skipna"} - exclude_result = {"engine", "engine_kwargs"} elif groupby_func in ("median", "prod", "sem"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} elif groupby_func in ("bfill", "ffill"): exclude_expected = {"inplace", "axis", "limit_area"} elif groupby_func in ("cummax", "cummin"): diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 1db12f05e821f..ea876cfdf4933 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -514,6 +514,147 @@ def test_sum_skipna_object(skipna): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "func, values, dtype, result_dtype", + [ + ("prod", [0, 1, 3, np.nan, 4, 5, 6, 7, -8, 9], "float64", "float64"), + ("prod", [0, -1, 3, 4, 5, np.nan, 6, 7, 8, 9], "Float64", "Float64"), + ("prod", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Int64", "Int64"), + ("prod", [np.nan] * 10, "float64", "float64"), + ("prod", [np.nan] * 10, "Float64", "Float64"), + ("prod", [np.nan] * 10, "Int64", "Int64"), + ("var", [0, -1, 3, 4, np.nan, 5, 6, 7, 8, 9], "float64", "float64"), + ("var", [0, 1, 3, -4, 5, 6, 7, -8, 9, np.nan], "Float64", "Float64"), + ("var", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "Int64", "Float64"), + ("var", [np.nan] * 10, "float64", "float64"), + ("var", [np.nan] * 10, "Float64", "Float64"), + ("var", [np.nan] * 10, "Int64", "Float64"), + ("std", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "float64", "float64"), + ("std", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "Float64", "Float64"), + ("std", [0, 1, 3, -4, 5, 6, 7, -8, 9, np.nan], "Int64", "Float64"), + ("std", [np.nan] * 10, "float64", "float64"), + ("std", [np.nan] * 10, "Float64", "Float64"), + ("std", [np.nan] * 10, "Int64", "Float64"), + ("sem", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "float64", "float64"), + ("sem", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Float64", "Float64"), + ("sem", [0, -1, 3, 4, 5, -6, 7, 8, 9, np.nan], "Int64", "Float64"), + ("sem", [np.nan] * 10, "float64", "float64"), + ("sem", [np.nan] * 10, "Float64", "Float64"), + ("sem", [np.nan] * 10, "Int64", "Float64"), + ("min", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "float64", "float64"), + ("min", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Float64", "Float64"), + ("min", [0, -1, 3, 4, 5, -6, 7, 8, 9, np.nan], "Int64", "Int64"), + ( + "min", + [0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], + "timedelta64[ns]", + "timedelta64[ns]", + ), + ( + "min", + pd.to_datetime( + [ + "2019-05-09", + pd.NaT, + "2019-05-11", + "2019-05-12", + "2019-05-13", + "2019-05-14", + "2019-05-15", + "2019-05-16", + "2019-05-17", + "2019-05-18", + ] + ), + "datetime64[ns]", + "datetime64[ns]", + ), + ("min", [np.nan] * 10, "float64", "float64"), + ("min", [np.nan] * 10, "Float64", "Float64"), + ("min", [np.nan] * 10, "Int64", "Int64"), + ("max", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "float64", "float64"), + ("max", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Float64", "Float64"), + ("max", [0, -1, 3, 4, 5, -6, 7, 8, 9, np.nan], 
"Int64", "Int64"), + ( + "max", + [0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], + "timedelta64[ns]", + "timedelta64[ns]", + ), + ( + "max", + pd.to_datetime( + [ + "2019-05-09", + pd.NaT, + "2019-05-11", + "2019-05-12", + "2019-05-13", + "2019-05-14", + "2019-05-15", + "2019-05-16", + "2019-05-17", + "2019-05-18", + ] + ), + "datetime64[ns]", + "datetime64[ns]", + ), + ("max", [np.nan] * 10, "float64", "float64"), + ("max", [np.nan] * 10, "Float64", "Float64"), + ("max", [np.nan] * 10, "Int64", "Int64"), + ("median", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "float64", "float64"), + ("median", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "Float64", "Float64"), + ("median", [0, -1, 3, 4, 5, -6, 7, 8, 9, np.nan], "Int64", "Float64"), + ( + "median", + [0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], + "timedelta64[ns]", + "timedelta64[ns]", + ), + ( + "median", + pd.to_datetime( + [ + "2019-05-09", + pd.NaT, + "2019-05-11", + "2019-05-12", + "2019-05-13", + "2019-05-14", + "2019-05-15", + "2019-05-16", + "2019-05-17", + "2019-05-18", + ] + ), + "datetime64[ns]", + "datetime64[ns]", + ), + ("median", [np.nan] * 10, "float64", "float64"), + ("median", [np.nan] * 10, "Float64", "Float64"), + ("median", [np.nan] * 10, "Int64", "Float64"), + ], +) +def test_multifunc_skipna(func, values, dtype, result_dtype, skipna): + # GH#15675 + df = DataFrame( + { + "val": values, + "cat": ["A", "B"] * 5, + } + ).astype({"val": dtype}) + # We need to recast the expected values to the result_dtype as some operations + # change the dtype + expected = ( + df.groupby("cat")["val"] + .apply(lambda x: getattr(x, func)(skipna=skipna)) + .astype(result_dtype) + ) + result = getattr(df.groupby("cat")["val"], func)(skipna=skipna) + tm.assert_series_equal(result, expected) + + def test_cython_median(): arr = np.random.default_rng(2).standard_normal(1000) arr[::2] = np.nan From e4f6270a7b9338c439a6352fca8029be26d8e211 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 3 Feb 2025 23:15:34 +0530 Subject: [PATCH 05/68] DOC: fix ES01 for pandas.reset_option (#60834) --- pandas/_config/config.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 35139979f92fe..0d06e6fa8e96c 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -321,6 +321,11 @@ def reset_option(pat: str) -> None: """ Reset one or more options to their default value. + This method resets the specified pandas option(s) back to their default + values. It allows partial string matching for convenience, but users should + exercise caution to avoid unintended resets due to changes in option names + in future versions. + Parameters ---------- pat : str/regex From 2a49a4f218c3819e128cd1c8ea7fc9c1f2bdf92b Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 3 Feb 2025 23:16:08 +0530 Subject: [PATCH 06/68] DOC: fix ES01 for pandas.core.resample.Resampler.indices (#60835) --- pandas/core/groupby/groupby.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 7c3088bea4b76..549e76ebc15eb 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -570,6 +570,13 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: """ Dict {group name -> group indices}. + The dictionary keys represent the group labels (e.g., timestamps for a + time-based resampling operation), and the values are arrays of integer + positions indicating where the elements of each group are located in the + original data. 
This property is particularly useful when working with + resampled data, as it provides insight into how the original time-series data + has been grouped. + See Also -------- core.groupby.DataFrameGroupBy.indices : Provides a mapping of group rows to From 569f94da9ecf0cd7c5eb565f5041b883726f6d3a Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 3 Feb 2025 23:16:36 +0530 Subject: [PATCH 07/68] DOC: fix ES01 for pandas.DataFrame.columns (#60836) --- pandas/core/frame.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3669d8249dd27..d9f7623064e05 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -13673,6 +13673,10 @@ def isin_(x): doc=""" The column labels of the DataFrame. + This property holds the column names as a pandas ``Index`` object. + It provides an immutable sequence of column labels that can be + used for data selection, renaming, and alignment in DataFrame operations. + Returns ------- pandas.Index From 4f664f156badac017c3775242559953a4da50b40 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 3 Feb 2025 23:17:10 +0530 Subject: [PATCH 08/68] DOC: fix ES01 for pandas.Series.array (#60837) --- pandas/core/base.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/core/base.py b/pandas/core/base.py index 61a7c079d87f8..a64cd8633c1db 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -506,6 +506,11 @@ def array(self) -> ExtensionArray: """ The ExtensionArray of the data backing this Series or Index. + This property provides direct access to the underlying array data of a + Series or Index without requiring conversion to a NumPy array. It + returns an ExtensionArray, which is the native storage format for + pandas extension dtypes. + Returns ------- ExtensionArray From 3bd27ffa296398c974c19571ccacd1eea76ca034 Mon Sep 17 00:00:00 2001 From: Florian Bourgey Date: Mon, 3 Feb 2025 12:51:31 -0500 Subject: [PATCH 09/68] DOC: Update parameter descriptions in `cut` function for clarity (#60839) --- pandas/core/reshape/tile.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index b3f946f289891..034b861a83f43 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -73,7 +73,7 @@ def cut( Parameters ---------- - x : array-like + x : 1d ndarray or Series The input array to be binned. Must be 1-dimensional. bins : int, sequence of scalars, or IntervalIndex The criteria to bin by. @@ -126,7 +126,7 @@ def cut( Categorical for all other inputs. The values stored within are whatever the type in the sequence is. - * False : returns an ndarray of integers. + * False : returns a 1d ndarray or Series of integers. bins : numpy.ndarray or IntervalIndex. The computed or specified bins. Only returned when `retbins=True`. 
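PATCH 09 above clarifies that ``cut`` accepts a 1d ndarray or Series and that ``labels=False`` yields integer bin codes rather than a Categorical. A minimal sketch of that behaviour (the bin edges here are illustrative):

    >>> import pandas as pd
    >>> ages = pd.Series([6, 12, 19, 25, 40])
    >>> pd.cut(ages, bins=[0, 10, 18, 65], labels=False).tolist()  # one integer code per input value
    [0, 1, 2, 2, 2]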
From c6fc6d0d7978f3958264fd372f56edf686614dac Mon Sep 17 00:00:00 2001 From: SebastianOuslis Date: Mon, 3 Feb 2025 12:53:01 -0500 Subject: [PATCH 10/68] DOC: Closed parameter not intuitively documented in DataFrame.rolling (#60832) * change docs * format * format --- pandas/core/groupby/groupby.py | 17 ++++++++++++----- pandas/core/window/rolling.py | 17 ++++++++++++----- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 549e76ebc15eb..9c27df4ed8c1b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3717,14 +3717,21 @@ def rolling( an integer index is not used to calculate the rolling window. closed : str, default None - If ``'right'``, the first point in the window is excluded from calculations. + Determines the inclusivity of points in the window. + If ``'right'``, (First, Last] the last point in the window + is included in the calculations. - If ``'left'``, the last point in the window is excluded from calculations. + If ``'left'``, [First, Last) the first point in the window + is included in the calculations. - If ``'both'``, no points in the window are excluded from calculations. + If ``'both'``, [First, Last] all points in the window + are included in the calculations. - If ``'neither'``, the first and last points in the window are excluded - from calculations. + If ``'neither'``, (First, Last) the first and last points + in the window are excluded from calculations. + + () and [] refer to open and closed set + notation, respectively. Default ``None`` (``'right'``). diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 631ab15464942..b954ce2584c13 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -929,14 +929,21 @@ class Window(BaseWindow): an integer index is not used to calculate the rolling window. closed : str, default None - If ``'right'``, the first point in the window is excluded from calculations. + Determines the inclusivity of points in the window. + If ``'right'``, (First, Last] the last point in the window + is included in the calculations. - If ``'left'``, the last point in the window is excluded from calculations. + If ``'left'``, [First, Last) the first point in the window + is included in the calculations. - If ``'both'``, no point in the window is excluded from calculations. + If ``'both'``, [First, Last] all points in the window + are included in the calculations. - If ``'neither'``, the first and last points in the window are excluded - from calculations. + If ``'neither'``, (First, Last) the first and last points + in the window are excluded from calculations. + + () and [] refer to open and closed set + notation, respectively. Default ``None`` (``'right'``). From e58bf26fa4d806f40624fb80d8321f2cc43d62a1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 3 Feb 2025 10:08:43 -0800 Subject: [PATCH 11/68] CI: Update some CI configurations (#60762) * CI: Update some CI configurations * Freeze Python dev * Add actions-313.yaml * Add 3.13 yaml * Move to pyside6 instead of pyqt * Revert "Move to pyside6 instead of pyqt" This reverts commit c04039fff983db3a94f42e7e16c79cd824672757. * Revert "Add 3.13 yaml" This reverts commit 0f888e1476da8f46cacaf6e63b4a5cfc2a1a8365.
* Revert "Freeze Python dev" This reverts commit c685af4d5871c2ce455d81f8bf212dc0e2e31aa9. * Move back to python 3.13 dev --- .github/workflows/unit-tests.yml | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 842629ba331d6..08c41a1eeb21f 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -107,7 +107,7 @@ jobs: services: mysql: - image: mysql:8 + image: mysql:9 env: MYSQL_ALLOW_EMPTY_PASSWORD: yes MYSQL_DATABASE: pandas @@ -120,7 +120,7 @@ jobs: - 3306:3306 postgres: - image: postgres:16 + image: postgres:17 env: PGUSER: postgres POSTGRES_USER: postgres @@ -135,7 +135,7 @@ jobs: - 5432:5432 moto: - image: motoserver/moto:5.0.0 + image: motoserver/moto:5.0.27 env: AWS_ACCESS_KEY_ID: foobar_key AWS_SECRET_ACCESS_KEY: foobar_secret @@ -242,15 +242,14 @@ jobs: - name: Build environment and Run Tests # https://github.com/numpy/numpy/issues/24703#issuecomment-1722379388 run: | - /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev + /opt/python/cp313-cp313/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install numpy -Csetup-args="-Dallow-noblas=true" python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror" python -m pip list --no-cache-dir - export PANDAS_CI=1 - python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml + PANDAS_CI=1 python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-32bit @@ -259,7 +258,7 @@ jobs: Linux-Musl: runs-on: ubuntu-22.04 container: - image: quay.io/pypa/musllinux_1_1_x86_64 + image: quay.io/pypa/musllinux_1_2_x86_64 steps: - name: Checkout pandas Repo # actions/checkout does not work since it requires node @@ -281,7 +280,7 @@ jobs: apk add musl-locales - name: Build environment run: | - /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev + /opt/python/cp313-cp313/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 @@ -291,8 +290,7 @@ jobs: - name: Run Tests run: | . 
~/virtualenvs/pandas-dev/bin/activate - export PANDAS_CI=1 - python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml + PANDAS_CI=1 python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-musl @@ -357,8 +355,7 @@ jobs: python --version python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy - python -m pip install versioneer[toml] - python -m pip install python-dateutil tzdata cython hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov + python -m pip install versioneer[toml] python-dateutil tzdata cython hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov python -m pip install -ve . --no-build-isolation --no-index --no-deps -Csetup-args="--werror" python -m pip list @@ -375,7 +372,7 @@ jobs: concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-python-freethreading-dev + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-python-freethreading-dev cancel-in-progress: true env: @@ -396,14 +393,11 @@ jobs: nogil: true - name: Build Environment - # TODO: Once numpy 2.2.1 is out, don't install nightly version - # Tests segfault with numpy 2.2.0: https://github.com/numpy/numpy/pull/27955 run: | python --version - python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 - python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython numpy - python -m pip install versioneer[toml] - python -m pip install python-dateutil pytz tzdata hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov + python -m pip install --upgrade pip setuptools wheel numpy meson[ninja]==1.2.1 meson-python==0.13.1 + python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython + python -m pip install versioneer[toml] python-dateutil pytz tzdata hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov python -m pip install -ve . 
--no-build-isolation --no-index --no-deps -Csetup-args="--werror" python -m pip list From e84a7f7b521d52812b227d9dab038f138373866f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 3 Feb 2025 11:20:07 -0800 Subject: [PATCH 12/68] [pre-commit.ci] pre-commit autoupdate (#60840) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [pre-commit.ci] pre-commit autoupdate updates: - [github.com/astral-sh/ruff-pre-commit: v0.8.6 → v0.9.4](https://github.com/astral-sh/ruff-pre-commit/compare/v0.8.6...v0.9.4) - [github.com/codespell-project/codespell: v2.3.0 → v2.4.1](https://github.com/codespell-project/codespell/compare/v2.3.0...v2.4.1) - [github.com/PyCQA/isort: 5.13.2 → 6.0.0](https://github.com/PyCQA/isort/compare/5.13.2...6.0.0) - [github.com/pre-commit/mirrors-clang-format: v19.1.6 → v19.1.7](https://github.com/pre-commit/mirrors-clang-format/compare/v19.1.6...v19.1.7) - [github.com/trim21/pre-commit-mirror-meson: v1.6.1 → v1.7.0](https://github.com/trim21/pre-commit-mirror-meson/compare/v1.6.1...v1.7.0) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Address ruff/codespell failures * Run ruff again --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .pre-commit-config.yaml | 10 ++++---- asv_bench/benchmarks/io/style.py | 4 ++-- doc/make.py | 6 ++--- doc/source/user_guide/style.ipynb | 2 +- pandas/core/apply.py | 3 +-- pandas/core/arrays/base.py | 6 +++-- pandas/core/arrays/datetimes.py | 3 +-- pandas/core/computation/eval.py | 2 +- pandas/core/computation/expr.py | 2 +- pandas/core/computation/ops.py | 3 +-- pandas/core/dtypes/cast.py | 2 +- pandas/core/dtypes/dtypes.py | 3 +-- pandas/core/generic.py | 3 +-- pandas/core/groupby/groupby.py | 3 +-- pandas/core/groupby/grouper.py | 3 +-- pandas/core/indexers/objects.py | 6 ++--- pandas/core/indexing.py | 12 ++++------ pandas/core/interchange/buffer.py | 3 +-- pandas/core/internals/blocks.py | 3 +-- pandas/core/internals/construction.py | 3 +-- pandas/core/ops/array_ops.py | 2 +- pandas/core/reshape/encoding.py | 3 +-- pandas/core/reshape/merge.py | 6 ++--- pandas/core/tools/datetimes.py | 6 ++--- pandas/io/excel/_odswriter.py | 2 +- pandas/io/formats/printing.py | 4 ++-- pandas/io/formats/style.py | 12 ++++++---- pandas/io/formats/style_render.py | 11 ++++----- pandas/io/formats/xml.py | 6 ++--- pandas/io/json/_json.py | 2 +- pandas/io/parsers/base_parser.py | 3 +-- pandas/io/parsers/python_parser.py | 6 ++--- pandas/io/parsers/readers.py | 6 ++--- pandas/io/sas/sas_xport.py | 9 +++----- pandas/plotting/_core.py | 23 ++++++++++++------- pandas/plotting/_matplotlib/boxplot.py | 3 +-- pandas/tests/arrays/interval/test_formats.py | 4 +--- pandas/tests/dtypes/cast/test_downcast.py | 4 ++-- pandas/tests/dtypes/test_dtypes.py | 3 +-- pandas/tests/dtypes/test_missing.py | 2 +- pandas/tests/extension/base/getitem.py | 4 ++-- pandas/tests/extension/json/array.py | 3 +-- pandas/tests/extension/list/array.py | 3 +-- pandas/tests/extension/test_arrow.py | 3 +-- pandas/tests/frame/methods/test_info.py | 2 +- pandas/tests/frame/methods/test_sample.py | 3 +-- pandas/tests/frame/methods/test_set_axis.py | 2 +- pandas/tests/groupby/test_categorical.py | 2 +- pandas/tests/groupby/test_raises.py | 5 +--- .../indexes/categorical/test_indexing.py | 6 ++--- 
.../indexes/datetimes/methods/test_round.py | 6 ++--- .../tests/indexes/datetimes/test_formats.py | 13 ++--------- .../tests/indexes/datetimes/test_indexing.py | 6 ++--- .../indexes/interval/test_constructors.py | 9 +++----- pandas/tests/indexes/interval/test_formats.py | 7 +----- pandas/tests/indexes/multi/test_indexing.py | 3 +-- pandas/tests/indexes/numeric/test_indexing.py | 3 +-- pandas/tests/indexes/period/test_formats.py | 3 +-- pandas/tests/indexes/period/test_indexing.py | 3 +-- pandas/tests/indexes/test_base.py | 3 +-- pandas/tests/indexes/test_index_new.py | 3 +-- .../tests/indexes/timedeltas/test_indexing.py | 3 +-- pandas/tests/indexing/test_iloc.py | 3 +-- pandas/tests/io/excel/test_readers.py | 3 +-- pandas/tests/io/excel/test_style.py | 6 ++--- pandas/tests/io/formats/style/test_style.py | 2 +- pandas/tests/io/formats/test_css.py | 3 +-- pandas/tests/io/formats/test_to_csv.py | 5 +--- pandas/tests/io/formats/test_to_html.py | 3 +-- pandas/tests/io/formats/test_to_markdown.py | 6 ++--- pandas/tests/io/formats/test_to_string.py | 18 +++------------ pandas/tests/io/json/test_pandas.py | 12 +++------- pandas/tests/io/json/test_readlines.py | 9 ++++---- pandas/tests/io/json/test_ujson.py | 4 ++-- .../io/parser/common/test_read_errors.py | 3 +-- pandas/tests/io/parser/test_mangle_dupes.py | 2 +- pandas/tests/io/parser/test_parse_dates.py | 4 ++-- pandas/tests/io/xml/test_xml.py | 3 +-- pandas/tests/plotting/test_series.py | 2 +- pandas/tests/resample/test_time_grouper.py | 2 +- .../tests/reshape/merge/test_merge_cross.py | 6 ++--- .../scalar/timedelta/test_constructors.py | 3 +-- pandas/tests/series/methods/test_between.py | 3 +-- pandas/tests/tools/test_to_datetime.py | 4 ++-- pandas/tests/tools/test_to_numeric.py | 6 ++--- pandas/tests/tseries/offsets/test_offsets.py | 6 ++--- pandas/tests/tseries/offsets/test_ticks.py | 3 +-- pandas/tests/tslibs/test_parsing.py | 5 +--- pandas/tseries/frequencies.py | 3 +-- pyproject.toml | 2 +- 90 files changed, 165 insertions(+), 260 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1dd8dfc54111e..77bcadf57dd2d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ ci: skip: [pyright, mypy] repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.6 + rev: v0.9.4 hooks: - id: ruff args: [--exit-non-zero-on-fix] @@ -41,7 +41,7 @@ repos: pass_filenames: true require_serial: false - repo: https://github.com/codespell-project/codespell - rev: v2.3.0 + rev: v2.4.1 hooks: - id: codespell types_or: [python, rst, markdown, cython, c] @@ -70,7 +70,7 @@ repos: - id: trailing-whitespace args: [--markdown-linebreak-ext=md] - repo: https://github.com/PyCQA/isort - rev: 5.13.2 + rev: 6.0.0 hooks: - id: isort - repo: https://github.com/asottile/pyupgrade @@ -95,14 +95,14 @@ repos: - id: sphinx-lint args: ["--enable", "all", "--disable", "line-too-long"] - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v19.1.6 + rev: v19.1.7 hooks: - id: clang-format files: ^pandas/_libs/src|^pandas/_libs/include args: [-i] types_or: [c, c++] - repo: https://github.com/trim21/pre-commit-mirror-meson - rev: v1.6.1 + rev: v1.7.0 hooks: - id: meson-fmt args: ['--inplace'] diff --git a/asv_bench/benchmarks/io/style.py b/asv_bench/benchmarks/io/style.py index 24fd8a0d20aba..0486cabb29845 100644 --- a/asv_bench/benchmarks/io/style.py +++ b/asv_bench/benchmarks/io/style.py @@ -13,8 +13,8 @@ class Render: def setup(self, cols, rows): self.df = DataFrame( np.random.randn(rows, cols), - 
columns=[f"float_{i+1}" for i in range(cols)], - index=[f"row_{i+1}" for i in range(rows)], + columns=[f"float_{i + 1}" for i in range(cols)], + index=[f"row_{i + 1}" for i in range(rows)], ) def time_apply_render(self, cols, rows): diff --git a/doc/make.py b/doc/make.py index 02deb5002fea1..9542563dc037b 100755 --- a/doc/make.py +++ b/doc/make.py @@ -260,8 +260,7 @@ def latex(self, force=False): for i in range(3): self._run_os("pdflatex", "-interaction=nonstopmode", "pandas.tex") raise SystemExit( - "You should check the file " - '"build/latex/pandas.pdf" for problems.' + 'You should check the file "build/latex/pandas.pdf" for problems.' ) self._run_os("make") return ret_code @@ -343,8 +342,7 @@ def main(): dest="verbosity", default=0, help=( - "increase verbosity (can be repeated), " - "passed to the sphinx build command" + "increase verbosity (can be repeated), passed to the sphinx build command" ), ) argparser.add_argument( diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index abb7181fc8d72..9cda1486eb48b 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -1288,7 +1288,7 @@ "outputs": [], "source": [ "df2.loc[:4].style.highlight_max(\n", - " axis=1, props=(\"color:white; \" \"font-weight:bold; \" \"background-color:darkblue;\")\n", + " axis=1, props=(\"color:white; font-weight:bold; background-color:darkblue;\")\n", ")" ] }, diff --git a/pandas/core/apply.py b/pandas/core/apply.py index af513d49bcfe0..f36fc82fb1a11 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1645,8 +1645,7 @@ def reconstruct_func( # GH 28426 will raise error if duplicated function names are used and # there is no reassigned name raise SpecificationError( - "Function names must be unique if there is no new column names " - "assigned" + "Function names must be unique if there is no new column names assigned" ) if func is None: # nicer error message diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index e831883998098..33745438e2aea 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1791,9 +1791,11 @@ def take(self, indices, allow_fill=False, fill_value=None): # type for the array, to the physical storage type for # the data, before passing to take. - result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill) + result = take( + data, indices, fill_value=fill_value, allow_fill=allow_fill + ) return self._from_sequence(result, dtype=self.dtype) - """ # noqa: E501 + """ # Implementer note: The `fill_value` parameter should be a user-facing # value, an instance of self.dtype.type. When passed `fill_value=None`, # the default of `self.dtype.na_value` should be used. 
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 43cc492f82885..df40c9c11b117 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -2707,8 +2707,7 @@ def _maybe_infer_tz(tz: tzinfo | None, inferred_tz: tzinfo | None) -> tzinfo | N pass elif not timezones.tz_compare(tz, inferred_tz): raise TypeError( - f"data is already tz-aware {inferred_tz}, unable to " - f"set specified tz: {tz}" + f"data is already tz-aware {inferred_tz}, unable to set specified tz: {tz}" ) return tz diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index 9d844e590582a..f8e3200ef2ba0 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -204,7 +204,7 @@ def eval( By default, with the numexpr engine, the following operations are supported: - - Arthimetic operations: ``+``, ``-``, ``*``, ``/``, ``**``, ``%`` + - Arithmetic operations: ``+``, ``-``, ``*``, ``/``, ``**``, ``%`` - Boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not) - Comparison operators: ``<``, ``<=``, ``==``, ``!=``, ``>=``, ``>`` diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 010fad1bbf0b6..14a393b02409c 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -698,7 +698,7 @@ def visit_Call(self, node, side=None, **kwargs): if not isinstance(key, ast.keyword): # error: "expr" has no attribute "id" raise ValueError( - "keyword error in function call " f"'{node.func.id}'" # type: ignore[attr-defined] + f"keyword error in function call '{node.func.id}'" # type: ignore[attr-defined] ) if key.arg: diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index 9b26de42e119b..f06ded6d9f98e 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -512,8 +512,7 @@ def __init__(self, op: Literal["+", "-", "~", "not"], operand) -> None: self.func = _unary_ops_dict[op] except KeyError as err: raise ValueError( - f"Invalid unary operator {op!r}, " - f"valid operators are {UNARY_OPS_SYMS}" + f"Invalid unary operator {op!r}, valid operators are {UNARY_OPS_SYMS}" ) from err def __call__(self, env) -> MathCall: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 02b9291da9b31..94531c2ac87e8 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1651,7 +1651,7 @@ def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.n # (test_constructor_coercion_signed_to_unsigned) so safe to ignore. 
warnings.filterwarnings( "ignore", - "NumPy will stop allowing conversion of " "out-of-bound Python int", + "NumPy will stop allowing conversion of out-of-bound Python int", DeprecationWarning, ) casted = np.asarray(arr, dtype=dtype) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 1eb1a630056a2..d8dd6441913b5 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -605,8 +605,7 @@ def update_dtype(self, dtype: str_type | CategoricalDtype) -> CategoricalDtype: return self elif not self.is_dtype(dtype): raise ValueError( - f"a CategoricalDtype must be passed to perform an update, " - f"got {dtype!r}" + f"a CategoricalDtype must be passed to perform an update, got {dtype!r}" ) else: # from here on, dtype is a CategoricalDtype diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e0a4f9d9c546a..f376518d4d3b8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5537,8 +5537,7 @@ def filter( nkw = common.count_not_none(items, like, regex) if nkw > 1: raise TypeError( - "Keyword arguments `items`, `like`, or `regex` " - "are mutually exclusive" + "Keyword arguments `items`, `like`, or `regex` are mutually exclusive" ) if axis is None: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 9c27df4ed8c1b..fdf2aab434695 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2632,8 +2632,7 @@ def _value_counts( doesnt_exist = subsetted - unique_cols if doesnt_exist: raise ValueError( - f"Keys {doesnt_exist} in subset do not " - f"exist in the DataFrame." + f"Keys {doesnt_exist} in subset do not exist in the DataFrame." ) else: subsetted = unique_cols diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 5f9ebdcea4a2d..c9d874fc08dbe 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -516,8 +516,7 @@ def __init__( ): grper = pprint_thing(grouping_vector) errmsg = ( - "Grouper result violates len(labels) == " - f"len(data)\nresult: {grper}" + f"Grouper result violates len(labels) == len(data)\nresult: {grper}" ) raise AssertionError(errmsg) diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py index 0064aa91056e8..88379164534f2 100644 --- a/pandas/core/indexers/objects.py +++ b/pandas/core/indexers/objects.py @@ -478,9 +478,9 @@ def get_window_bounds( ) start = start.astype(np.int64) end = end.astype(np.int64) - assert len(start) == len( - end - ), "these should be equal in length from get_window_bounds" + assert len(start) == len(end), ( + "these should be equal in length from get_window_bounds" + ) # Cannot use groupby_indices as they might not be monotonic with the object # we're rolling over window_indices = np.arange( diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 656ee54cbc5d4..8a493fef54d3b 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -975,8 +975,7 @@ def _validate_tuple_indexer(self, key: tuple) -> tuple: self._validate_key(k, i) except ValueError as err: raise ValueError( - "Location based indexing can only have " - f"[{self._valid_types}] types" + f"Location based indexing can only have [{self._valid_types}] types" ) from err return key @@ -1589,8 +1588,7 @@ def _validate_key(self, key, axis: AxisInt) -> None: "is not available" ) raise ValueError( - "iLocation based boolean indexing cannot use " - "an indexable as a mask" + "iLocation based boolean indexing cannot use an indexable as a mask" ) return @@ -1994,8 +1992,7 @@ 
def _setitem_with_indexer_split_path(self, indexer, value, name: str): return self._setitem_with_indexer((pi, info_axis[0]), value[0]) raise ValueError( - "Must have equal len keys and value " - "when setting with an iterable" + "Must have equal len keys and value when setting with an iterable" ) elif lplane_indexer == 0 and len(value) == len(self.obj.index): @@ -2023,8 +2020,7 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): else: raise ValueError( - "Must have equal len keys and value " - "when setting with an iterable" + "Must have equal len keys and value when setting with an iterable" ) else: diff --git a/pandas/core/interchange/buffer.py b/pandas/core/interchange/buffer.py index 62bf396256f2a..8953360a91c8e 100644 --- a/pandas/core/interchange/buffer.py +++ b/pandas/core/interchange/buffer.py @@ -31,8 +31,7 @@ def __init__(self, x: np.ndarray, allow_copy: bool = True) -> None: x = x.copy() else: raise RuntimeError( - "Exports cannot be zero-copy in the case " - "of a non-contiguous buffer" + "Exports cannot be zero-copy in the case of a non-contiguous buffer" ) # Store the numpy array in which the data resides as a private diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f44ad926dda5c..d1a9081b234de 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2264,8 +2264,7 @@ def check_ndim(values, placement: BlockPlacement, ndim: int) -> None: if values.ndim > ndim: # Check for both np.ndarray and ExtensionArray raise ValueError( - "Wrong number of dimensions. " - f"values.ndim > ndim [{values.ndim} > {ndim}]" + f"Wrong number of dimensions. values.ndim > ndim [{values.ndim} > {ndim}]" ) if not is_1d_only_ea_dtype(values.dtype): diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index dfff34656f82b..69da2be0306f6 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -907,8 +907,7 @@ def _validate_or_indexify_columns( if not is_mi_list and len(columns) != len(content): # pragma: no cover # caller's responsibility to check for this... 
raise AssertionError( - f"{len(columns)} columns passed, passed data had " - f"{len(content)} columns" + f"{len(columns)} columns passed, passed data had {len(content)} columns" ) if is_mi_list: # check if nested list column, length of each sub-list should be equal diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 983a3df57e369..3a466b6fc7fc8 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -164,7 +164,7 @@ def _masked_arith_op(x: np.ndarray, y, op) -> np.ndarray: else: if not is_scalar(y): raise TypeError( - f"Cannot broadcast np.ndarray with operand of type { type(y) }" + f"Cannot broadcast np.ndarray with operand of type {type(y)}" ) # mask is only meaningful for x diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 33ff182f5baee..6a590ee5b227e 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -495,8 +495,7 @@ def from_dummies( if col_isna_mask.any(): raise ValueError( - "Dummy DataFrame contains NA value in column: " - f"'{col_isna_mask.idxmax()}'" + f"Dummy DataFrame contains NA value in column: '{col_isna_mask.idxmax()}'" ) # index data with a list of all columns that are dummies diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 5fddd9f9aca5b..ab056c8cc7e37 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1929,9 +1929,9 @@ def get_join_indexers( np.ndarray[np.intp] or None Indexer into the right_keys. """ - assert len(left_keys) == len( - right_keys - ), "left_keys and right_keys must be the same length" + assert len(left_keys) == len(right_keys), ( + "left_keys and right_keys must be the same length" + ) # fast-path for empty left/right left_n = len(left_keys[0]) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 30487de7bafd5..0a10001a3113f 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -192,9 +192,9 @@ def should_cache( else: check_count = 500 else: - assert ( - 0 <= check_count <= len(arg) - ), "check_count must be in next bounds: [0; len(arg)]" + assert 0 <= check_count <= len(arg), ( + "check_count must be in next bounds: [0; len(arg)]" + ) if check_count == 0: return False diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py index 10a06aec72a57..ba4919c9298ed 100644 --- a/pandas/io/excel/_odswriter.py +++ b/pandas/io/excel/_odswriter.py @@ -270,7 +270,7 @@ def _process_style(self, style: dict[str, Any] | None) -> str | None: style_key = json.dumps(style) if style_key in self._style_dict: return self._style_dict[style_key] - name = f"pd{len(self._style_dict)+1}" + name = f"pd{len(self._style_dict) + 1}" self._style_dict[style_key] = name odf_style = Style(name=name, family="table-cell") if "font" in style: diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index a9936ba8c8f2c..b466e986450b1 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -336,8 +336,8 @@ def format_object_summary( if indent_for_name: name_len = len(name) - space1 = f'\n{(" " * (name_len + 1))}' - space2 = f'\n{(" " * (name_len + 2))}' + space1 = f"\n{(' ' * (name_len + 1))}" + space2 = f"\n{(' ' * (name_len + 2))}" else: space1 = "\n" space2 = "\n " # space for the opening '[' diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 3f37556867954..b4c55da3eddd6 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -2588,7 +2588,7 @@ def 
set_sticky( for i, level in enumerate(levels_): styles.append( { - "selector": f"thead tr:nth-child({level+1}) th", + "selector": f"thead tr:nth-child({level + 1}) th", "props": props + ( f"top:{i * pixel_size}px; height:{pixel_size}px; " @@ -2599,7 +2599,7 @@ def set_sticky( if not all(name is None for name in self.index.names): styles.append( { - "selector": f"thead tr:nth-child({obj.nlevels+1}) th", + "selector": f"thead tr:nth-child({obj.nlevels + 1}) th", "props": props + ( f"top:{(len(levels_)) * pixel_size}px; " @@ -2619,7 +2619,7 @@ def set_sticky( styles.extend( [ { - "selector": f"thead tr th:nth-child({level+1})", + "selector": f"thead tr th:nth-child({level + 1})", "props": props_ + "z-index:3 !important;", }, { @@ -4214,8 +4214,10 @@ def css_bar(start: float, end: float, color: str) -> str: if end > start: cell_css += "background: linear-gradient(90deg," if start > 0: - cell_css += f" transparent {start*100:.1f}%, {color} {start*100:.1f}%," - cell_css += f" {color} {end*100:.1f}%, transparent {end*100:.1f}%)" + cell_css += ( + f" transparent {start * 100:.1f}%, {color} {start * 100:.1f}%," + ) + cell_css += f" {color} {end * 100:.1f}%, transparent {end * 100:.1f}%)" return cell_css def css_calc(x, left: float, right: float, align: str, color: str | list | tuple): diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index 2d1218b007d19..482ed316c7ce4 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -850,10 +850,7 @@ def _generate_body_row( data_element = _element( "td", - ( - f"{self.css['data']} {self.css['row']}{r} " - f"{self.css['col']}{c}{cls}" - ), + (f"{self.css['data']} {self.css['row']}{r} {self.css['col']}{c}{cls}"), value, data_element_visible, attributes="", @@ -973,7 +970,7 @@ def concatenated_visible_rows(obj): idx_len = d["index_lengths"].get((lvl, r), None) if idx_len is not None: # i.e. not a sparsified entry d["clines"][rn + idx_len].append( - f"\\cline{{{lvln+1}-{len(visible_index_levels)+data_len}}}" + f"\\cline{{{lvln + 1}-{len(visible_index_levels) + data_len}}}" # noqa: E501 ) def format( @@ -1557,7 +1554,7 @@ def relabel_index( >>> df = pd.DataFrame({"samples": np.random.rand(10)}) >>> styler = df.loc[np.random.randint(0, 10, 3)].style - >>> styler.relabel_index([f"sample{i+1} ({{}})" for i in range(3)]) + >>> styler.relabel_index([f"sample{i + 1} ({{}})" for i in range(3)]) ... 
# doctest: +SKIP samples sample1 (5) 0.315811 @@ -2520,7 +2517,7 @@ def color(value, user_arg, command, comm_arg): if value[0] == "#" and len(value) == 7: # color is hex code return command, f"[HTML]{{{value[1:].upper()}}}{arg}" if value[0] == "#" and len(value) == 4: # color is short hex code - val = f"{value[1].upper()*2}{value[2].upper()*2}{value[3].upper()*2}" + val = f"{value[1].upper() * 2}{value[2].upper() * 2}{value[3].upper() * 2}" return command, f"[HTML]{{{val}}}{arg}" elif value[:3] == "rgb": # color is rgb or rgba r = re.findall("(?<=\\()[0-9\\s%]+(?=,)", value)[0].strip() diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index 47f162e93216d..febf43b9a1018 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -260,7 +260,7 @@ def _other_namespaces(self) -> dict: nmsp_dict: dict[str, str] = {} if self.namespaces: nmsp_dict = { - f"xmlns{p if p=='' else f':{p}'}": n + f"xmlns{p if p == '' else f':{p}'}": n for p, n in self.namespaces.items() if n != self.prefix_uri[1:-1] } @@ -404,7 +404,7 @@ def _get_prefix_uri(self) -> str: f"{self.prefix} is not included in namespaces" ) from err elif "" in self.namespaces: - uri = f'{{{self.namespaces[""]}}}' + uri = f"{{{self.namespaces['']}}}" else: uri = "" @@ -502,7 +502,7 @@ def _get_prefix_uri(self) -> str: f"{self.prefix} is not included in namespaces" ) from err elif "" in self.namespaces: - uri = f'{{{self.namespaces[""]}}}' + uri = f"{{{self.namespaces['']}}}" else: uri = "" diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 237518b3c8d92..703a2b3656c9c 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -917,7 +917,7 @@ def _combine_lines(self, lines) -> str: Combines a list of JSON objects into one JSON object. """ return ( - f'[{",".join([line for line in (line.strip() for line in lines) if line])}]' + f"[{','.join([line for line in (line.strip() for line in lines) if line])}]" ) @overload diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index e263c69376d05..c283f600eb971 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -112,8 +112,7 @@ def __init__(self, kwds) -> None: parse_dates = bool(parse_dates) elif not isinstance(parse_dates, list): raise TypeError( - "Only booleans and lists are accepted " - "for the 'parse_dates' parameter" + "Only booleans and lists are accepted for the 'parse_dates' parameter" ) self.parse_dates: bool | list = parse_dates self.date_parser = kwds.pop("date_parser", lib.no_default) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index db9547a18b600..e7b5c7f06a79a 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -595,8 +595,7 @@ def _infer_columns( joi = list(map(str, header[:-1] if have_mi_columns else header)) msg = f"[{','.join(joi)}], len of {len(joi)}, " raise ValueError( - f"Passed header={msg}" - f"but only {self.line_pos} lines in file" + f"Passed header={msg}but only {self.line_pos} lines in file" ) from err # We have an empty file, so check @@ -1219,8 +1218,7 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]: for row_num, actual_len in bad_lines: msg = ( - f"Expected {col_len} fields in line {row_num + 1}, saw " - f"{actual_len}" + f"Expected {col_len} fields in line {row_num + 1}, saw {actual_len}" ) if ( self.delimiter diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 54877017f76fc..67193f930b4dc 100644 --- 
a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1219,8 +1219,7 @@ def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]: and value != getattr(value, "value", default) ): raise ValueError( - f"The {argname!r} option is not supported with the " - f"'pyarrow' engine" + f"The {argname!r} option is not supported with the 'pyarrow' engine" ) options[argname] = value @@ -1396,8 +1395,7 @@ def _clean_options( if not is_integer(skiprows) and skiprows is not None: # pyarrow expects skiprows to be passed as an integer raise ValueError( - "skiprows argument must be an integer when using " - "engine='pyarrow'" + "skiprows argument must be an integer when using engine='pyarrow'" ) else: if is_integer(skiprows): diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index 89dbdab64c23c..a9c45e720fd56 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -33,19 +33,16 @@ ReadBuffer, ) _correct_line1 = ( - "HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!" - "000000000000000000000000000000 " + "HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!000000000000000000000000000000 " ) _correct_header1 = ( "HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!000000000000000001600000000" ) _correct_header2 = ( - "HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!" - "000000000000000000000000000000 " + "HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!000000000000000000000000000000 " ) _correct_obs_header = ( - "HEADER RECORD*******OBS HEADER RECORD!!!!!!!" - "000000000000000000000000000000 " + "HEADER RECORD*******OBS HEADER RECORD!!!!!!!000000000000000000000000000000 " ) _fieldkeys = [ "ntype", diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index aee872f9ae50a..9670b5439c87e 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -247,11 +247,14 @@ def hist_frame( .. plot:: :context: close-figs - >>> data = {"length": [1.5, 0.5, 1.2, 0.9, 3], "width": [0.7, 0.2, 0.15, 0.2, 1.1]} + >>> data = { + ... "length": [1.5, 0.5, 1.2, 0.9, 3], + ... "width": [0.7, 0.2, 0.15, 0.2, 1.1], + ... } >>> index = ["pig", "rabbit", "duck", "chicken", "horse"] >>> df = pd.DataFrame(data, index=index) >>> hist = df.hist(bins=3) - """ # noqa: E501 + """ plot_backend = _get_plot_backend(backend) return plot_backend.hist_frame( data, @@ -845,7 +848,10 @@ class PlotAccessor(PandasObject): :context: close-figs >>> df = pd.DataFrame( - ... {"length": [1.5, 0.5, 1.2, 0.9, 3], "width": [0.7, 0.2, 0.15, 0.2, 1.1]}, + ... { + ... "length": [1.5, 0.5, 1.2, 0.9, 3], + ... "width": [0.7, 0.2, 0.15, 0.2, 1.1], + ... }, ... index=["pig", "rabbit", "duck", "chicken", "horse"], ... ) >>> plot = df.plot(title="DataFrame Plot") @@ -866,7 +872,7 @@ class PlotAccessor(PandasObject): >>> df = pd.DataFrame({"col1": [1, 2, 3, 4], "col2": ["A", "B", "A", "B"]}) >>> plot = df.groupby("col2").plot(kind="bar", title="DataFrameGroupBy Plot") - """ # noqa: E501 + """ _common_kinds = ("line", "bar", "barh", "kde", "density", "area", "hist", "box") _series_kinds = ("pie",) @@ -993,8 +999,7 @@ def __call__(self, *args, **kwargs): if kind not in self._all_kinds: raise ValueError( - f"{kind} is not a valid plot kind " - f"Valid plot kinds: {self._all_kinds}" + f"{kind} is not a valid plot kind Valid plot kinds: {self._all_kinds}" ) data = self._parent @@ -1630,7 +1635,9 @@ def area( ... "signups": [5, 5, 6, 12, 14, 13], ... "visits": [20, 42, 28, 62, 81, 50], ... }, - ... index=pd.date_range(start="2018/01/01", end="2018/07/01", freq="ME"), + ... 
index=pd.date_range( + ... start="2018/01/01", end="2018/07/01", freq="ME" + ... ), ... ) >>> ax = df.plot.area() @@ -1662,7 +1669,7 @@ def area( ... } ... ) >>> ax = df.plot.area(x="day") - """ # noqa: E501 + """ return self(kind="area", x=x, y=y, stacked=stacked, **kwargs) def pie(self, y: IndexLabel | None = None, **kwargs) -> PlotAccessor: diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index 5ad30a68ae3c9..af77972da8634 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -123,8 +123,7 @@ def _validate_color_args(self, color, colormap): if colormap is not None: warnings.warn( - "'color' and 'colormap' cannot be used " - "simultaneously. Using 'color'", + "'color' and 'colormap' cannot be used simultaneously. Using 'color'", stacklevel=find_stack_level(), ) diff --git a/pandas/tests/arrays/interval/test_formats.py b/pandas/tests/arrays/interval/test_formats.py index 535efee519374..88c9bf81d718c 100644 --- a/pandas/tests/arrays/interval/test_formats.py +++ b/pandas/tests/arrays/interval/test_formats.py @@ -6,8 +6,6 @@ def test_repr(): arr = IntervalArray.from_tuples([(0, 1), (1, 2)]) result = repr(arr) expected = ( - "\n" - "[(0, 1], (1, 2]]\n" - "Length: 2, dtype: interval[int64, right]" + "\n[(0, 1], (1, 2]]\nLength: 2, dtype: interval[int64, right]" ) assert result == expected diff --git a/pandas/tests/dtypes/cast/test_downcast.py b/pandas/tests/dtypes/cast/test_downcast.py index 9430ba2c478ae..69200b2e5fc96 100644 --- a/pandas/tests/dtypes/cast/test_downcast.py +++ b/pandas/tests/dtypes/cast/test_downcast.py @@ -33,9 +33,9 @@ ( # This is a judgement call, but we do _not_ downcast Decimal # objects - np.array([decimal.Decimal(0.0)]), + np.array([decimal.Decimal("0.0")]), "int64", - np.array([decimal.Decimal(0.0)]), + np.array([decimal.Decimal("0.0")]), ), ( # GH#45837 diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index b7e37ff270e60..621217a8c9317 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -660,8 +660,7 @@ def test_construction_generic(self, subtype): def test_construction_not_supported(self, subtype): # GH 19016 msg = ( - "category, object, and string subtypes are not supported " - "for IntervalDtype" + "category, object, and string subtypes are not supported for IntervalDtype" ) with pytest.raises(TypeError, match=msg): IntervalDtype(subtype) diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 73c462d492d2d..c61cda83cf6e0 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -321,7 +321,7 @@ def test_period(self): def test_decimal(self): # scalars GH#23530 - a = Decimal(1.0) + a = Decimal("1.0") assert isna(a) is False assert notna(a) is True diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 27fa1206f6f7f..1f3680bf67e90 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -139,8 +139,8 @@ def test_getitem_invalid(self, data): "index out of bounds", # pyarrow "Out of bounds access", # Sparse f"loc must be an integer between -{ub} and {ub}", # Sparse - f"index {ub+1} is out of bounds for axis 0 with size {ub}", - f"index -{ub+1} is out of bounds for axis 0 with size {ub}", + f"index {ub + 1} is out of bounds for axis 0 with size {ub}", + f"index -{ub + 1} is out of bounds for axis 0 with size {ub}", ] ) with pytest.raises(IndexError, 
match=msg): diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index a68c8a06e1d18..b110911bda400 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -176,8 +176,7 @@ def take(self, indexer, allow_fill=False, fill_value=None): # an ndarary. indexer = np.asarray(indexer) msg = ( - "Index is out of bounds or cannot do a " - "non-empty take from an empty array." + "Index is out of bounds or cannot do a non-empty take from an empty array." ) if allow_fill: diff --git a/pandas/tests/extension/list/array.py b/pandas/tests/extension/list/array.py index da53bdcb4e37e..8b4728c7d6292 100644 --- a/pandas/tests/extension/list/array.py +++ b/pandas/tests/extension/list/array.py @@ -81,8 +81,7 @@ def take(self, indexer, allow_fill=False, fill_value=None): # an ndarary. indexer = np.asarray(indexer) msg = ( - "Index is out of bounds or cannot do a " - "non-empty take from an empty array." + "Index is out of bounds or cannot do a non-empty take from an empty array." ) if allow_fill: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 4fccf02e08bd6..d6f428f4938a6 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -964,8 +964,7 @@ def _get_arith_xfail_marker(self, opname, pa_dtype): mark = pytest.mark.xfail( raises=TypeError, reason=( - f"{opname} not supported between" - f"pd.NA and {pa_dtype} Python scalar" + f"{opname} not supported between pd.NA and {pa_dtype} Python scalar" ), ) elif opname == "__rfloordiv__" and ( diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py index 74e4383950174..462d86cadde88 100644 --- a/pandas/tests/frame/methods/test_info.py +++ b/pandas/tests/frame/methods/test_info.py @@ -522,7 +522,7 @@ def test_info_int_columns(using_infer_string): 0 1 2 non-null int64 1 2 2 non-null int64 dtypes: int64(2) - memory usage: {'50.0' if using_infer_string and HAS_PYARROW else '48.0+'} bytes + memory usage: {"50.0" if using_infer_string and HAS_PYARROW else "48.0+"} bytes """ ) assert result == expected diff --git a/pandas/tests/frame/methods/test_sample.py b/pandas/tests/frame/methods/test_sample.py index 91d735a8b2fa7..a9d56cbfd2b46 100644 --- a/pandas/tests/frame/methods/test_sample.py +++ b/pandas/tests/frame/methods/test_sample.py @@ -198,8 +198,7 @@ def test_sample_upsampling_without_replacement(self, frame_or_series): obj = tm.get_obj(obj, frame_or_series) msg = ( - "Replace has to be set to `True` when " - "upsampling the population `frac` > 1."
) with pytest.raises(ValueError, match=msg): obj.sample(frac=2, replace=False) diff --git a/pandas/tests/frame/methods/test_set_axis.py b/pandas/tests/frame/methods/test_set_axis.py index 1967941bca9f0..7b75bcf4f348d 100644 --- a/pandas/tests/frame/methods/test_set_axis.py +++ b/pandas/tests/frame/methods/test_set_axis.py @@ -93,7 +93,7 @@ def test_set_axis_setattr_index_wrong_length(self, obj): # wrong length msg = ( f"Length mismatch: Expected axis has {len(obj)} elements, " - f"new values have {len(obj)-1} elements" + f"new values have {len(obj) - 1} elements" ) with pytest.raises(ValueError, match=msg): obj.index = np.arange(len(obj) - 1) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 20309e852a556..e49be8c00b426 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -990,7 +990,7 @@ def test_sort(): # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar') df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 10)}) - labels = [f"{i} - {i+499}" for i in range(0, 10000, 500)] + labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)] cat_labels = Categorical(labels, labels) df = df.sort_values(by=["value"], ascending=True) diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index ba13d3bd7278f..864b9e5d55991 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -263,10 +263,7 @@ def test_groupby_raises_string_np( if using_infer_string: if groupby_func_np is np.mean: klass = TypeError - msg = ( - f"Cannot perform reduction '{groupby_func_np.__name__}' " - "with string dtype" - ) + msg = f"Cannot perform reduction '{groupby_func_np.__name__}' with string dtype" _call_and_check(klass, msg, how, gb, groupby_func_np, ()) diff --git a/pandas/tests/indexes/categorical/test_indexing.py b/pandas/tests/indexes/categorical/test_indexing.py index 49eb79da616e7..25232075a07d9 100644 --- a/pandas/tests/indexes/categorical/test_indexing.py +++ b/pandas/tests/indexes/categorical/test_indexing.py @@ -64,8 +64,7 @@ def test_take_fill_value(self): tm.assert_categorical_equal(result.values, expected.values) msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) @@ -103,8 +102,7 @@ def test_take_fill_value_datetime(self): tm.assert_index_equal(result, expected) msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) diff --git a/pandas/tests/indexes/datetimes/methods/test_round.py b/pandas/tests/indexes/datetimes/methods/test_round.py index cde4a3a65804d..b023542ba0a4c 100644 --- a/pandas/tests/indexes/datetimes/methods/test_round.py +++ b/pandas/tests/indexes/datetimes/methods/test_round.py @@ -216,6 +216,6 @@ def test_round_int64(self, start, index_freq, periods, round_freq): assert (mod == 0).all(), f"round not a {round_freq} multiple" assert (diff <= unit // 2).all(), "round error" if unit % 2 == 0: - assert ( - result.asi8[diff == unit // 2] % 2 == 0 - ).all(), "round half to even error" + assert (result.asi8[diff == unit // 2] % 2 == 0).all(), ( + "round half to even error" + ) 
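Note: the test hunks above also pick up ruff 0.9's new layout for long assert statements with messages: instead of wrapping the condition in parentheses, the formatter keeps the condition on one line and parenthesises the message. A small before/after sketch with made-up values (both spellings are equivalent at runtime):

    values = [2, 4, 6]

    # ruff < 0.9 wrapped the condition to stay under the line limit:
    assert all(
        v % 2 == 0 for v in values
    ), "all values should be even"

    # ruff 0.9 wraps the message instead:
    assert all(v % 2 == 0 for v in values), (
        "all values should be even"
    )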
diff --git a/pandas/tests/indexes/datetimes/test_formats.py b/pandas/tests/indexes/datetimes/test_formats.py index 4551fdf073193..f4e0a63043335 100644 --- a/pandas/tests/indexes/datetimes/test_formats.py +++ b/pandas/tests/indexes/datetimes/test_formats.py @@ -205,12 +205,7 @@ def test_dti_representation_to_series(self, unit): exp3 = "0 2011-01-01\n1 2011-01-02\ndtype: datetime64[ns]" - exp4 = ( - "0 2011-01-01\n" - "1 2011-01-02\n" - "2 2011-01-03\n" - "dtype: datetime64[ns]" - ) + exp4 = "0 2011-01-01\n1 2011-01-02\n2 2011-01-03\ndtype: datetime64[ns]" exp5 = ( "0 2011-01-01 09:00:00+09:00\n" @@ -226,11 +221,7 @@ def test_dti_representation_to_series(self, unit): "dtype: datetime64[ns, US/Eastern]" ) - exp7 = ( - "0 2011-01-01 09:00:00\n" - "1 2011-01-02 10:15:00\n" - "dtype: datetime64[ns]" - ) + exp7 = "0 2011-01-01 09:00:00\n1 2011-01-02 10:15:00\ndtype: datetime64[ns]" with pd.option_context("display.width", 300): for idx, expected in zip( diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index bfbcdcff51ee6..c44345273466c 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -338,8 +338,7 @@ def test_take_fill_value(self): tm.assert_index_equal(result, expected) msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) @@ -375,8 +374,7 @@ def test_take_fill_value_with_timezone(self): tm.assert_index_equal(result, expected) msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) diff --git a/pandas/tests/indexes/interval/test_constructors.py b/pandas/tests/indexes/interval/test_constructors.py index 8db483751438c..90423149658ab 100644 --- a/pandas/tests/indexes/interval/test_constructors.py +++ b/pandas/tests/indexes/interval/test_constructors.py @@ -154,8 +154,7 @@ def test_constructor_empty(self, constructor, breaks, closed): def test_constructor_string(self, constructor, breaks): # GH 19016 msg = ( - "category, object, and string subtypes are not supported " - "for IntervalIndex" + "category, object, and string subtypes are not supported for IntervalIndex" ) with pytest.raises(TypeError, match=msg): constructor(**self.get_kwargs_from_breaks(breaks)) @@ -224,8 +223,7 @@ def test_constructor_errors(self): # GH 19016: categorical data data = Categorical(list("01234abcde"), ordered=True) msg = ( - "category, object, and string subtypes are not supported " - "for IntervalIndex" + "category, object, and string subtypes are not supported for IntervalIndex" ) with pytest.raises(TypeError, match=msg): IntervalIndex.from_arrays(data[:-1], data[1:]) @@ -297,8 +295,7 @@ def test_constructor_errors(self): # GH 19016: categorical data data = Categorical(list("01234abcde"), ordered=True) msg = ( - "category, object, and string subtypes are not supported " - "for IntervalIndex" + "category, object, and string subtypes are not supported for IntervalIndex" ) with pytest.raises(TypeError, match=msg): IntervalIndex.from_breaks(data) diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index 73bbfc91028b3..d45d894c485c9 
100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -21,12 +21,7 @@ class TestIntervalIndexRendering: [ ( Series, - ( - "(0.0, 1.0] a\n" - "NaN b\n" - "(2.0, 3.0] c\n" - "dtype: object" - ), + ("(0.0, 1.0] a\nNaN b\n(2.0, 3.0] c\ndtype: object"), ), (DataFrame, (" 0\n(0.0, 1.0] a\nNaN b\n(2.0, 3.0] c")), ], diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index d82203a53a60f..f098690be2afa 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -259,8 +259,7 @@ def test_get_indexer(self): def test_get_indexer_nearest(self): midx = MultiIndex.from_tuples([("a", 1), ("b", 2)]) msg = ( - "method='nearest' not implemented yet for MultiIndex; " - "see GitHub issue 9365" + "method='nearest' not implemented yet for MultiIndex; see GitHub issue 9365" ) with pytest.raises(NotImplementedError, match=msg): midx.get_indexer(["a"], method="nearest") diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py index 43adc09774914..3c1b98d57b2a0 100644 --- a/pandas/tests/indexes/numeric/test_indexing.py +++ b/pandas/tests/indexes/numeric/test_indexing.py @@ -479,8 +479,7 @@ def test_take_fill_value_float64(self): tm.assert_index_equal(result, expected) msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) diff --git a/pandas/tests/indexes/period/test_formats.py b/pandas/tests/indexes/period/test_formats.py index 9f36eb1e7a1d1..dc95e19523842 100644 --- a/pandas/tests/indexes/period/test_formats.py +++ b/pandas/tests/indexes/period/test_formats.py @@ -63,8 +63,7 @@ def test_representation(self, method): exp3 = "PeriodIndex(['2011-01-01', '2011-01-02'], dtype='period[D]')" exp4 = ( - "PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], " - "dtype='period[D]')" + "PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], dtype='period[D]')" ) exp5 = "PeriodIndex(['2011', '2012', '2013'], dtype='period[Y-DEC]')" diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index 2683e25eda618..00e8262ddfa4c 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -700,8 +700,7 @@ def test_take_fill_value(self): tm.assert_index_equal(result, expected) msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 608158d40cf23..5b75bd9afd6df 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1112,8 +1112,7 @@ def test_take_fill_value(self): def test_take_fill_value_none_raises(self): index = Index(list("ABC"), name="xxx") msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py index 4a31ae88a757a..dd228e6b713b5 100644 --- 
a/pandas/tests/indexes/test_index_new.py +++ b/pandas/tests/indexes/test_index_new.py @@ -419,8 +419,7 @@ class TestIndexConstructionErrors: def test_constructor_overflow_int64(self): # see GH#15832 msg = ( - "The elements provided in the data cannot " - "all be casted to the dtype int64" + "The elements provided in the data cannot all be casted to the dtype int64" ) with pytest.raises(OverflowError, match=msg): Index([np.iinfo(np.uint64).max - 1], dtype="int64") diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py index e411555c65bea..426083cb6b67c 100644 --- a/pandas/tests/indexes/timedeltas/test_indexing.py +++ b/pandas/tests/indexes/timedeltas/test_indexing.py @@ -262,8 +262,7 @@ def test_take_fill_value(self): tm.assert_index_equal(result, expected) msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + "When allow_fill=True and fill_value is not None, all indices must be >= -1" ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index dc95e1bb1b8a0..2f6998a85c80b 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -763,8 +763,7 @@ def test_iloc_mask(self): "(index of the boolean Series and of the " "indexed object do not match).", ("locs", ".iloc"): ( - "iLocation based boolean indexing on an " - "integer type is not available" + "iLocation based boolean indexing on an integer type is not available" ), } diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 34824f0a67985..140cf39b26556 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -910,8 +910,7 @@ def test_corrupt_bytes_raises(self, engine): error = XLRDError msg = ( - "Unsupported format, or corrupt file: Expected BOF " - "record; found b'foo'" + "Unsupported format, or corrupt file: Expected BOF record; found b'foo'" ) elif engine == "calamine": from python_calamine import CalamineError diff --git a/pandas/tests/io/excel/test_style.py b/pandas/tests/io/excel/test_style.py index 71ef1201e523f..0e13b2f94ed58 100644 --- a/pandas/tests/io/excel/test_style.py +++ b/pandas/tests/io/excel/test_style.py @@ -356,6 +356,6 @@ def test_format_hierarchical_rows_periodindex(merge_cells): for cell in formatted_cells: if cell.row != 0 and cell.col == 0: - assert isinstance( - cell.val, Timestamp - ), "Period should be converted to Timestamp" + assert isinstance(cell.val, Timestamp), ( + "Period should be converted to Timestamp" + ) diff --git a/pandas/tests/io/formats/style/test_style.py b/pandas/tests/io/formats/style/test_style.py index ff8a1b9f570ab..b7dcfde327b83 100644 --- a/pandas/tests/io/formats/style/test_style.py +++ b/pandas/tests/io/formats/style/test_style.py @@ -933,7 +933,7 @@ def test_trim(self, df): def test_export(self, df, styler): f = lambda x: "color: red" if x > 0 else "color: blue" - g = lambda x, z: f"color: {z}" if x > 0 else f"color: {z}" + g = lambda x, z: f"color: {z}" style1 = styler style1.map(f).map(g, z="b").highlight_max()._compute() # = render result = style1.export() diff --git a/pandas/tests/io/formats/test_css.py b/pandas/tests/io/formats/test_css.py index c4ecb48006cb1..642a562704344 100644 --- a/pandas/tests/io/formats/test_css.py +++ b/pandas/tests/io/formats/test_css.py @@ -193,8 +193,7 @@ def test_css_border_shorthands(prop, expected): ( "margin: 1px; 
margin-top: 2px", "", - "margin-left: 1px; margin-right: 1px; " - "margin-bottom: 1px; margin-top: 2px", + "margin-left: 1px; margin-right: 1px; margin-bottom: 1px; margin-top: 2px", ), ("margin-top: 2px", "margin: 1px", "margin: 1px; margin-top: 2px"), ("margin: 1px", "margin-top: 2px", "margin: 1px"), diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 7bf041a50b745..6d762fdeb8d79 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -482,10 +482,7 @@ def test_to_csv_string_with_crlf(self): # case 3: CRLF as line terminator # 'lineterminator' should not change inner element expected_crlf = ( - b"int,str_crlf\r\n" - b"1,abc\r\n" - b'2,"d\r\nef"\r\n' - b'3,"g\r\nh\r\n\r\ni"\r\n' + b'int,str_crlf\r\n1,abc\r\n2,"d\r\nef"\r\n3,"g\r\nh\r\n\r\ni"\r\n' ) df.to_csv(path, lineterminator="\r\n", index=False) with open(path, "rb") as f: diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index b1a437bfdbd8a..9c75314b66fa2 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -94,8 +94,7 @@ def test_to_html_with_column_specific_col_space_raises(): ) msg = ( - "Col_space length\\(\\d+\\) should match " - "DataFrame number of columns\\(\\d+\\)" + "Col_space length\\(\\d+\\) should match DataFrame number of columns\\(\\d+\\)" ) with pytest.raises(ValueError, match=msg): df.to_html(col_space=[30, 40]) diff --git a/pandas/tests/io/formats/test_to_markdown.py b/pandas/tests/io/formats/test_to_markdown.py index 7aa7cebb5120f..f3d9b88cc91e2 100644 --- a/pandas/tests/io/formats/test_to_markdown.py +++ b/pandas/tests/io/formats/test_to_markdown.py @@ -35,8 +35,7 @@ def test_empty_frame(): df.to_markdown(buf=buf) result = buf.getvalue() assert result == ( - "| id | first_name | last_name |\n" - "|------|--------------|-------------|" + "| id | first_name | last_name |\n|------|--------------|-------------|" ) @@ -65,8 +64,7 @@ def test_series(): s.to_markdown(buf=buf) result = buf.getvalue() assert result == ( - "| | foo |\n|---:|------:|\n| 0 | 1 " - "|\n| 1 | 2 |\n| 2 | 3 |" + "| | foo |\n|---:|------:|\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |" ) diff --git a/pandas/tests/io/formats/test_to_string.py b/pandas/tests/io/formats/test_to_string.py index 1e8598c918efe..63c975fd831e7 100644 --- a/pandas/tests/io/formats/test_to_string.py +++ b/pandas/tests/io/formats/test_to_string.py @@ -377,17 +377,11 @@ def test_to_string_small_float_values(self): # sadness per above if _three_digit_exp(): expected = ( - " a\n" - "0 1.500000e+000\n" - "1 1.000000e-017\n" - "2 -5.500000e-007" + " a\n0 1.500000e+000\n1 1.000000e-017\n2 -5.500000e-007" ) else: expected = ( - " a\n" - "0 1.500000e+00\n" - "1 1.000000e-17\n" - "2 -5.500000e-07" + " a\n0 1.500000e+00\n1 1.000000e-17\n2 -5.500000e-07" ) assert result == expected @@ -1210,13 +1204,7 @@ def test_to_string_float_na_spacing(self): ser[::2] = np.nan result = ser.to_string() - expected = ( - "0 NaN\n" - "1 1.5678\n" - "2 NaN\n" - "3 -3.0000\n" - "4 NaN" - ) + expected = "0 NaN\n1 1.5678\n2 NaN\n3 -3.0000\n4 NaN" assert result == expected def test_to_string_with_datetimeindex(self): diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 5dc1272880c9b..144b36166261b 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1267,9 +1267,7 @@ def test_default_handler_numpy_unsupported_dtype(self): columns=["a", "b"], ) expected = ( 
- '[["(1+0j)","(nan+0j)"],' - '["(2.3+0j)","(nan+0j)"],' - '["(4-5j)","(1.2+0j)"]]' + '[["(1+0j)","(nan+0j)"],["(2.3+0j)","(nan+0j)"],["(4-5j)","(1.2+0j)"]]' ) assert df.to_json(default_handler=str, orient="values") == expected @@ -1372,11 +1370,7 @@ def test_tz_is_naive(self): ) def test_tz_range_is_utc(self, tz_range): exp = '["2013-01-01T05:00:00.000Z","2013-01-02T05:00:00.000Z"]' - dfexp = ( - '{"DT":{' - '"0":"2013-01-01T05:00:00.000Z",' - '"1":"2013-01-02T05:00:00.000Z"}}' - ) + dfexp = '{"DT":{"0":"2013-01-01T05:00:00.000Z","1":"2013-01-02T05:00:00.000Z"}}' assert ujson_dumps(tz_range, iso_dates=True) == exp dti = DatetimeIndex(tz_range) @@ -1775,7 +1769,7 @@ def test_read_json_with_url_value(self, url): ) def test_read_json_with_very_long_file_path(self, compression): # GH 46718 - long_json_path = f'{"a" * 1000}.json{compression}' + long_json_path = f"{'a' * 1000}.json{compression}" with pytest.raises( FileNotFoundError, match=f"File {long_json_path} does not exist" ): diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 3c843479b446a..d482eb5fa1a06 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -236,9 +236,9 @@ def test_readjson_chunks_closes(chunksize): ) with reader: reader.read() - assert ( - reader.handles.handle.closed - ), f"didn't close stream with chunksize = {chunksize}" + assert reader.handles.handle.closed, ( + f"didn't close stream with chunksize = {chunksize}" + ) @pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"]) @@ -435,8 +435,7 @@ def test_to_json_append_mode(mode_): # Test ValueError when mode is not supported option df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]}) msg = ( - f"mode={mode_} is not a valid option." - "Only 'w' and 'a' are currently supported." + f"mode={mode_} is not a valid option.Only 'w' and 'a' are currently supported." 
) with pytest.raises(ValueError, match=msg): df.to_json(mode=mode_, lines=False, orient="records") diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index c5ccc3b3f7184..8f49afdb1f289 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -991,7 +991,7 @@ def test_decode_array(self, arr): def test_decode_extreme_numbers(self, extreme_num): assert extreme_num == ujson.ujson_loads(str(extreme_num)) - @pytest.mark.parametrize("too_extreme_num", [f"{2**64}", f"{-2**63-1}"]) + @pytest.mark.parametrize("too_extreme_num", [f"{2**64}", f"{-(2**63) - 1}"]) def test_decode_too_extreme_numbers(self, too_extreme_num): with pytest.raises( ValueError, @@ -1006,7 +1006,7 @@ def test_decode_with_trailing_non_whitespaces(self): with pytest.raises(ValueError, match="Trailing data"): ujson.ujson_loads("{}\n\t a") - @pytest.mark.parametrize("value", [f"{2**64}", f"{-2**63-1}"]) + @pytest.mark.parametrize("value", [f"{2**64}", f"{-(2**63) - 1}"]) def test_decode_array_with_big_int(self, value): with pytest.raises( ValueError, diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index ed2e729430b01..a73327beea8bb 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -131,8 +131,7 @@ def test_catch_too_many_names(all_parsers): msg = ( "Too many columns specified: expected 4 and found 3" if parser.engine == "c" - else "Number of passed names did not match " - "number of header fields in the file" + else "Number of passed names did not match number of header fields in the file" ) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index d3789cd387c05..55c8bbc4bb9e1 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -136,7 +136,7 @@ def test_mangled_unnamed_placeholders(all_parsers): expected = DataFrame(columns=Index([], dtype="str")) for j in range(i + 1): - col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1) + col_name = "Unnamed: 0" + f".{1 * j}" * min(j, 1) expected.insert(loc=0, column=col_name, value=[0, 1, 2]) expected[orig_key] = orig_value diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 1411ed5019766..9a15d9bc84a2e 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -228,7 +228,7 @@ def test_parse_tz_aware(all_parsers): def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs): # see gh-5636 parser = all_parsers - msg = "Only booleans and lists " "are accepted for the 'parse_dates' parameter" + msg = "Only booleans and lists are accepted for the 'parse_dates' parameter" data = """A,B,C 1,2,2003-11-1""" @@ -239,7 +239,7 @@ def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs): @pytest.mark.parametrize("parse_dates", [(1,), np.array([4, 5]), {1, 3}]) def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates): parser = all_parsers - msg = "Only booleans and lists " "are accepted for the 'parse_dates' parameter" + msg = "Only booleans and lists are accepted for the 'parse_dates' parameter" data = """A,B,C 1,2,2003-11-1""" diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 5c07a56c9fb3f..d897d251909fe 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ 
-1503,8 +1503,7 @@ def test_bad_xml(parser): with pytest.raises( SyntaxError, match=( - "Extra content at the end of the document|" - "junk after document element" + "Extra content at the end of the document|junk after document element" ), ): read_xml( diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 9675b936c171e..c3b0219971446 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -427,7 +427,7 @@ def test_pie_series_autopct_and_fontsize(self): ax = _check_plot_works( series.plot.pie, colors=color_args, autopct="%.2f", fontsize=7 ) - pcts = [f"{s*100:.2f}" for s in series.values / series.sum()] + pcts = [f"{s * 100:.2f}" for s in series.values / series.sum()] expected_texts = list(chain.from_iterable(zip(series.index, pcts))) _check_text_labels(ax.texts, expected_texts) for t in ax.texts: diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 30e2c9dfe3d30..3cc95922e7f2f 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -353,7 +353,7 @@ def test_groupby_resample_interpolate_raises(groupy_test_df): for df in dfs: with pytest.raises( NotImplementedError, - match="Direct interpolation of MultiIndex data frames is " "not supported", + match="Direct interpolation of MultiIndex data frames is not supported", ): df.groupby("volume").resample("1D").interpolate(method="linear") diff --git a/pandas/tests/reshape/merge/test_merge_cross.py b/pandas/tests/reshape/merge/test_merge_cross.py index 14f9036e43fce..6ab80cf0e0823 100644 --- a/pandas/tests/reshape/merge/test_merge_cross.py +++ b/pandas/tests/reshape/merge/test_merge_cross.py @@ -42,8 +42,7 @@ def test_merge_cross_error_reporting(kwargs): left = DataFrame({"a": [1, 3]}) right = DataFrame({"b": [3, 4]}) msg = ( - "Can not pass on, right_on, left_on or set right_index=True or " - "left_index=True" + "Can not pass on, right_on, left_on or set right_index=True or left_index=True" ) with pytest.raises(MergeError, match=msg): merge(left, right, how="cross", **kwargs) @@ -94,8 +93,7 @@ def test_join_cross_error_reporting(): left = DataFrame({"a": [1, 3]}) right = DataFrame({"a": [3, 4]}) msg = ( - "Can not pass on, right_on, left_on or set right_index=True or " - "left_index=True" + "Can not pass on, right_on, left_on or set right_index=True or left_index=True" ) with pytest.raises(MergeError, match=msg): left.join(right, how="cross", on="a") diff --git a/pandas/tests/scalar/timedelta/test_constructors.py b/pandas/tests/scalar/timedelta/test_constructors.py index e029dfc3b2703..45caeb1733590 100644 --- a/pandas/tests/scalar/timedelta/test_constructors.py +++ b/pandas/tests/scalar/timedelta/test_constructors.py @@ -353,8 +353,7 @@ def test_construction(): Timedelta("foo") msg = ( - "cannot construct a Timedelta from " - "the passed arguments, allowed keywords are " + "cannot construct a Timedelta from the passed arguments, allowed keywords are " ) with pytest.raises(ValueError, match=msg): Timedelta(day=10) diff --git a/pandas/tests/series/methods/test_between.py b/pandas/tests/series/methods/test_between.py index e67eafbd118ce..f035767e2ce0e 100644 --- a/pandas/tests/series/methods/test_between.py +++ b/pandas/tests/series/methods/test_between.py @@ -66,8 +66,7 @@ def test_between_error_args(self, inclusive): left, right = series[[2, 7]] value_error_msg = ( - "Inclusive has to be either string of 'both'," - "'left', 'right', or 'neither'." 
+ "Inclusive has to be either string of 'both','left', 'right', or 'neither'." ) series = Series(date_range("1/1/2000", periods=10)) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 74b051aec71a4..566fd8d901569 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1935,7 +1935,7 @@ def test_to_datetime_unit_na_values(self): @pytest.mark.parametrize("bad_val", ["foo", 111111111]) def test_to_datetime_unit_invalid(self, bad_val): if bad_val == "foo": - msg = "Unknown datetime string format, unable to parse: " f"{bad_val}" + msg = f"Unknown datetime string format, unable to parse: {bad_val}" else: msg = "cannot convert input 111111111 with the unit 'D'" with pytest.raises(ValueError, match=msg): @@ -2258,7 +2258,7 @@ def test_to_datetime_iso8601_exact_fails(self, input, format): [ '^unconverted data remains when parsing with format ".*": ".*". ' f"{PARSING_ERR_MSG}$", - f'^time data ".*" doesn\'t match format ".*". ' f"{PARSING_ERR_MSG}$", + f'^time data ".*" doesn\'t match format ".*". {PARSING_ERR_MSG}$', ] ) with pytest.raises( diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index f3645bf0649bd..893f526fb3eb0 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -192,7 +192,7 @@ def test_numeric_df_columns(columns): # see gh-14827 df = DataFrame( { - "a": [1.2, decimal.Decimal(3.14), decimal.Decimal("infinity"), "0.1"], + "a": [1.2, decimal.Decimal("3.14"), decimal.Decimal("infinity"), "0.1"], "b": [1.0, 2.0, 3.0, 4.0], } ) @@ -207,10 +207,10 @@ def test_numeric_df_columns(columns): "data,exp_data", [ ( - [[decimal.Decimal(3.14), 1.0], decimal.Decimal(1.6), 0.1], + [[decimal.Decimal("3.14"), 1.0], decimal.Decimal("1.6"), 0.1], [[3.14, 1.0], 1.6, 0.1], ), - ([np.array([decimal.Decimal(3.14), 1.0]), 0.1], [[3.14, 1.0], 0.1]), + ([np.array([decimal.Decimal("3.14"), 1.0]), 0.1], [[3.14, 1.0], 0.1]), ], ) def test_numeric_embedded_arr_likes(data, exp_data): diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index d0192c12f9518..7480b99595066 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -798,9 +798,9 @@ def test_get_offset(): for name, expected in pairs: offset = _get_offset(name) - assert ( - offset == expected - ), f"Expected {name!r} to yield {expected!r} (actual: {offset!r})" + assert offset == expected, ( + f"Expected {name!r} to yield {expected!r} (actual: {offset!r})" + ) def test_get_offset_legacy(): diff --git a/pandas/tests/tseries/offsets/test_ticks.py b/pandas/tests/tseries/offsets/test_ticks.py index f91230e1460c4..46b6846ad1ec2 100644 --- a/pandas/tests/tseries/offsets/test_ticks.py +++ b/pandas/tests/tseries/offsets/test_ticks.py @@ -289,8 +289,7 @@ def test_tick_rdiv(cls): td64 = delta.to_timedelta64() instance__type = ".".join([cls.__module__, cls.__name__]) msg = ( - "unsupported operand type\\(s\\) for \\/: 'int'|'float' and " - f"'{instance__type}'" + f"unsupported operand type\\(s\\) for \\/: 'int'|'float' and '{instance__type}'" ) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 07425af8ed37a..bc5cd5fcccbf8 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -134,10 +134,7 @@ def test_does_not_convert_mixed_integer(date_string, expected): ( 
"2013Q1", {"freq": "INVLD-L-DEC-SAT"}, - ( - "Unable to retrieve month information " - "from given freq: INVLD-L-DEC-SAT" - ), + ("Unable to retrieve month information from given freq: INVLD-L-DEC-SAT"), ), ], ) diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 9a01568971af8..88ea1bfa3c6ed 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -145,8 +145,7 @@ def infer_freq( pass elif isinstance(index.dtype, PeriodDtype): raise TypeError( - "PeriodIndex given. Check the `freq` attribute " - "instead of using infer_freq." + "PeriodIndex given. Check the `freq` attribute instead of using infer_freq." ) elif lib.is_np_dtype(index.dtype, "m"): # Allow TimedeltaIndex and TimedeltaArray diff --git a/pyproject.toml b/pyproject.toml index 7ab9cd2c17669..c6af69438f849 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -746,5 +746,5 @@ exclude_lines = [ directory = "coverage_html_report" [tool.codespell] -ignore-words-list = "blocs, coo, hist, nd, sav, ser, recuse, nin, timere, expec, expecs, indext, SME, NotIn, tructures, tru" +ignore-words-list = "blocs, coo, hist, nd, sav, ser, recuse, nin, timere, expec, expecs, indext, SME, NotIn, tructures, tru, indx, abd, ABD" ignore-regex = 'https://([\w/\.])+' From 3866b98121e84b6fd01ed08de008372aa50e0841 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Mon, 3 Feb 2025 14:07:49 -0800 Subject: [PATCH 13/68] DOC: Fix description of skipna parameter in groupby reductions (#60842) --- pandas/core/groupby/groupby.py | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index fdf2aab434695..27865a60f6ea3 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2170,8 +2170,7 @@ def mean( numeric_only no longer accepts ``None`` and defaults to ``False``. skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. + Exclude NA/null values. If an entire group is NA, the result will be NA. .. versionadded:: 3.0.0 @@ -2271,8 +2270,7 @@ def median(self, numeric_only: bool = False, skipna: bool = True) -> NDFrameT: numeric_only no longer accepts ``None`` and defaults to False. skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. + Exclude NA/null values. If an entire group is NA, the result will be NA. .. versionadded:: 3.0.0 @@ -2405,8 +2403,7 @@ def std( numeric_only now defaults to ``False``. skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. + Exclude NA/null values. If an entire group is NA, the result will be NA. .. versionadded:: 3.0.0 @@ -2524,8 +2521,7 @@ def var( numeric_only now defaults to ``False``. skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. + Exclude NA/null values. If an entire group is NA, the result will be NA. .. versionadded:: 3.0.0 @@ -2742,8 +2738,7 @@ def sem( numeric_only now defaults to ``False``. skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. + Exclude NA/null values. If an entire group is NA, the result will be NA. .. versionadded:: 3.0.0 @@ -3021,8 +3016,7 @@ def prod( than ``min_count`` non-NA values are present the result will be NA. skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. 
+ Exclude NA/null values. If an entire group is NA, the result will be NA. .. versionadded:: 3.0.0 @@ -3242,8 +3236,7 @@ def first( The required number of valid values to perform the operation. If fewer than ``min_count`` valid values are present the result will be NA. skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. + Exclude NA/null values. If an entire group is NA, the result will be NA. .. versionadded:: 2.2.1 @@ -3329,8 +3322,7 @@ def last( The required number of valid values to perform the operation. If fewer than ``min_count`` valid values are present the result will be NA. skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. + Exclude NA/null values. If an entire group is NA, the result will be NA. .. versionadded:: 2.2.1 @@ -5530,8 +5522,7 @@ def _idxmax_idxmin( numeric_only : bool, default False Include only float, int, boolean columns. skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. + Exclude NA/null values. If an entire group is NA, the result will be NA. ignore_unobserved : bool, default False When True and an unobserved group is encountered, do not raise. This used for transform where unobserved groups do not play an impact on the result. From fc6da9c7f590ffd2eaec801060ee4b239fbf3d92 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Mon, 3 Feb 2025 14:22:19 -0800 Subject: [PATCH 14/68] TST: parametrize Decimal ujson test (#60843) --- pandas/tests/io/json/test_ujson.py | 70 ++++++++---------------------- 1 file changed, 17 insertions(+), 53 deletions(-) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 8f49afdb1f289..d2bf9bdb139bd 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -53,60 +53,24 @@ def orient(request): class TestUltraJSONTests: @pytest.mark.skipif(not IS64, reason="not compliant on 32-bit, xref #15865") - def test_encode_decimal(self): - sut = decimal.Decimal("1337.1337") - encoded = ujson.ujson_dumps(sut, double_precision=15) - decoded = ujson.ujson_loads(encoded) - assert decoded == "1337.1337" - - sut = decimal.Decimal("0.95") - encoded = ujson.ujson_dumps(sut, double_precision=1) - assert encoded == '"0.95"' - - decoded = ujson.ujson_loads(encoded) - assert decoded == "0.95" - - sut = decimal.Decimal("0.94") - encoded = ujson.ujson_dumps(sut, double_precision=1) - assert encoded == '"0.94"' - - decoded = ujson.ujson_loads(encoded) - assert decoded == "0.94" - - sut = decimal.Decimal("1.95") - encoded = ujson.ujson_dumps(sut, double_precision=1) - assert encoded == '"1.95"' - - decoded = ujson.ujson_loads(encoded) - assert decoded == "1.95" - - sut = decimal.Decimal("-1.95") - encoded = ujson.ujson_dumps(sut, double_precision=1) - assert encoded == '"-1.95"' - - decoded = ujson.ujson_loads(encoded) - assert decoded == "-1.95" - - sut = decimal.Decimal("0.995") - encoded = ujson.ujson_dumps(sut, double_precision=2) - assert encoded == '"0.995"' - - decoded = ujson.ujson_loads(encoded) - assert decoded == "0.995" - - sut = decimal.Decimal("0.9995") - encoded = ujson.ujson_dumps(sut, double_precision=3) - assert encoded == '"0.9995"' - - decoded = ujson.ujson_loads(encoded) - assert decoded == "0.9995" - - sut = decimal.Decimal("0.99999999999999944") - encoded = ujson.ujson_dumps(sut, double_precision=15) - assert encoded == '"0.99999999999999944"' - + @pytest.mark.parametrize( + "value, 
double_precision", + [ + ("1337.1337", 15), + ("0.95", 1), + ("0.94", 1), + ("1.95", 1), + ("-1.95", 1), + ("0.995", 2), + ("0.9995", 3), + ("0.99999999999999944", 15), + ], + ) + def test_encode_decimal(self, value, double_precision): + sut = decimal.Decimal(value) + encoded = ujson.ujson_dumps(sut, double_precision=double_precision) decoded = ujson.ujson_loads(encoded) - assert decoded == "0.99999999999999944" + assert decoded == value @pytest.mark.parametrize("ensure_ascii", [True, False]) def test_encode_string_conversion(self, ensure_ascii): From e8306037a3a5782b18d3f8db81ae1dbde8ec21bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quang=20Nguy=E1=BB=85n?= <30631476+quangngd@users.noreply.github.com> Date: Tue, 4 Feb 2025 10:27:32 +0700 Subject: [PATCH 15/68] BUG: stack with empty level list (#60826) * return early if set_levels is empty * add test * add whatsnew * check empty before make set --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/reshape/reshape.py | 2 ++ pandas/tests/frame/test_stack_unstack.py | 19 +++++++++++++++++++ 3 files changed, 22 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 9089b9cdd2185..95b5f7eea5eeb 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -766,6 +766,7 @@ Reshaping - Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`) - Bug in :meth:`DataFrame.merge` when merging two :class:`DataFrame` on ``intc`` or ``uintc`` types on Windows (:issue:`60091`, :issue:`58713`) - Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`) +- Bug in :meth:`DataFrame.stack` with the new implementation where ``ValueError`` is raised when ``level=[]`` (:issue:`60740`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`) Sparse diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 9b7b768fe7adb..c60fe71a7ff28 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -929,6 +929,8 @@ def _reorder_for_extension_array_stack( def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: if frame.columns.nunique() != len(frame.columns): raise ValueError("Columns with duplicate values are not supported in stack") + if not len(level): + return frame set_levels = set(level) stack_cols = frame.columns._drop_level_numbers( [k for k in range(frame.columns.nlevels - 1, -1, -1) if k not in set_levels] diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index abc14d10514fa..22fdfd3a01408 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1452,6 +1452,25 @@ def test_stack_empty_frame(dropna, future_stack): tm.assert_series_equal(result, expected) +@pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated") +@pytest.mark.parametrize("dropna", [True, False, lib.no_default]) +def test_stack_empty_level(dropna, future_stack, int_frame): + # GH 60740 + if future_stack and dropna is not lib.no_default: + with pytest.raises(ValueError, match="dropna must be unspecified"): + DataFrame(dtype=np.int64).stack(dropna=dropna, future_stack=future_stack) + else: + expected = int_frame + result = int_frame.copy().stack( + level=[], dropna=dropna, future_stack=future_stack + ) + 
tm.assert_frame_equal(result, expected) + + expected = DataFrame() + result = DataFrame().stack(level=[], dropna=dropna, future_stack=future_stack) + tm.assert_frame_equal(result, expected) + + @pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated") @pytest.mark.parametrize("dropna", [True, False, lib.no_default]) @pytest.mark.parametrize("fill_value", [None, 0])
From b2a7a262977391a09a49295dec4bebe0a120e316 Mon Sep 17 00:00:00 2001 From: Shashwat Agrawal <72117025+ShashwatAgrawal20@users.noreply.github.com> Date: Tue, 4 Feb 2025 23:20:48 +0530 Subject: [PATCH 16/68] DOC: `pandas.DataFrame.to_html` additional description for the border parameter (#60830) * should work * fix: proper backticks --- pandas/core/frame.py | 10 +++++++--- pandas/io/formats/format.py | 10 +++++++--- 2 files changed, 14 insertions(+), 6 deletions(-)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d9f7623064e05..b715e526e0f33 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3205,9 +3205,13 @@ def to_html( Convert the characters <, >, and & to HTML-safe sequences. notebook : {True, False}, default False Whether the generated HTML is for IPython Notebook. - border : int - A ``border=border`` attribute is included in the opening - ``<table>`` tag. Default ``pd.options.display.html.border``. + border : int or bool + When an integer value is provided, it sets the border attribute in + the opening tag, specifying the thickness of the border. + If ``False`` or ``0`` is passed, the border attribute will not + be present in the ``<table>`` tag. + The default value for this parameter is governed by + ``pd.options.display.html.border``. table_id : str, optional A css id is included in the opening `<table>` tag if specified. render_links : bool, default False
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 46ecb2b9a8f12..b7fbc4e5e22b7 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -897,9 +897,13 @@ ``<table>`` tag, in addition to the default "dataframe". notebook : {True, False}, optional, default False Whether the generated HTML is for IPython Notebook. - border : int - A ``border=border`` attribute is included in the opening - ``<table>`` tag. Default ``pd.options.display.html.border``. + border : int or bool + When an integer value is provided, it sets the border attribute in + the opening tag, specifying the thickness of the border. + If ``False`` or ``0`` is passed, the border attribute will not + be present in the ``<table>`` tag. + The default value for this parameter is governed by + ``pd.options.display.html.border``. table_id : str, optional A css id is included in the opening `<table>` tag if specified. render_links : bool, default False
From 68569a683a9e1068a82397a113b6dd2d8fa9cdd1 Mon Sep 17 00:00:00 2001 From: Ehsan Totoni Date: Tue, 4 Feb 2025 12:55:17 -0500 Subject: [PATCH 17/68] DOC: Update Bodo project description in ecosystem page (#60846) --- web/pandas/community/ecosystem.md | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md index dc7b9bc947214..29297488da64f 100644 --- a/web/pandas/community/ecosystem.md +++ b/web/pandas/community/ecosystem.md @@ -496,17 +496,29 @@ You can find more information about the Hugging Face Dataset Hub in the [documen ## Out-of-core -### [Bodo](https://bodo.ai/) +### [Bodo](https://github.com/bodo-ai/Bodo) -Bodo is a high-performance Python computing engine that automatically parallelizes and -optimizes your code through compilation using HPC (high-performance computing) techniques. -Designed to operate with native pandas dataframes, Bodo compiles your pandas code to execute -across multiple cores on a single machine or distributed clusters of multiple compute nodes efficiently. -Bodo also makes distributed pandas dataframes queryable with SQL. -The community edition of Bodo is free to use on up to 8 cores. Beyond that, Bodo offers a paid -enterprise edition. Free licenses of Bodo (for more than 8 cores) are available -[upon request](https://www.bodo.ai/contact) for academic and non-profit use. +Bodo is a high-performance compute engine for Python data processing. +Using an auto-parallelizing just-in-time (JIT) compiler, Bodo simplifies scaling Pandas
+workloads from laptops to clusters without major code changes.
+Under the hood, Bodo relies on MPI-based high-performance computing (HPC) technology—making it
+both easier to use and often much faster than alternatives.
+Bodo also provides a SQL engine that can query distributed pandas dataframes efficiently.
+
+```python
+import pandas as pd
+import bodo
+
+@bodo.jit
+def process_data():
+    df = pd.read_parquet("my_data.pq")
+    df2 = pd.DataFrame({"A": df.apply(lambda r: 0 if r.A == 0 else (r.B // r.A), axis=1)})
+    df2.to_parquet("out.pq")
+
+process_data()
+```
+ ### [Cylon](https://cylondata.org/)
From 898bb02ec05d246bf4aa4f8f279c57d2a7716f9d Mon Sep 17 00:00:00 2001 From: "Christine P. Chai" Date: Tue, 4 Feb 2025 18:48:29 -0800 Subject: [PATCH 18/68] DOC: Update a link in cookbook.rst (#60849) --- doc/source/user_guide/cookbook.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index b2b5c5cc1014e..b3decb6342527 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -874,7 +874,7 @@ Timeseries `__ `Aggregation and plotting time series -`__ +`__ Turn a matrix with hours in columns and days in rows into a continuous row sequence in the form of a time series. `How to rearrange a Python pandas DataFrame? 
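Before the next patch, a quick sketch of the ``border`` semantics documented in PATCH 16 above (illustrative only, not part of the patch series; the surrounding HTML beyond the border attribute may differ across pandas versions):

```python
import pandas as pd

df = pd.DataFrame({"A": [1, 2]})

# An integer is rendered as a border attribute in the opening <table> tag.
assert 'border="2"' in df.to_html(border=2)

# Passing False or 0 omits the border attribute from the <table> tag.
assert "border=" not in df.to_html(border=0)
```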
From 1f106e06fd4829f6775dc90511dcbb6e7ab9f9c6 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 5 Feb 2025 23:10:43 +0530 Subject: [PATCH 19/68] DOC: fix ES01 for pandas.DataFrame.select_dtypes (#60855) --- pandas/core/frame.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b715e526e0f33..b36791e6e1dd1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4793,6 +4793,10 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: """ Return a subset of the DataFrame's columns based on the column dtypes. + This method allows for filtering columns based on their data types. + It is useful when working with heterogeneous DataFrames where operations + need to be performed on a specific subset of data types. + Parameters ---------- include, exclude : scalar or list-like From 348fd11b3c89fb7c91c35cb7cdfd302ba7b3d041 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 5 Feb 2025 23:11:21 +0530 Subject: [PATCH 20/68] DOC: fix ES01 for pandas.read_orc (#60851) --- pandas/io/orc.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index a945f3dc38d35..1a2d564d5b44d 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -45,6 +45,13 @@ def read_orc( """ Load an ORC object from the file path, returning a DataFrame. + This method reads an ORC (Optimized Row Columnar) file into a pandas + DataFrame using the `pyarrow.orc` library. ORC is a columnar storage format + that provides efficient compression and fast retrieval for analytical workloads. + It allows reading specific columns, handling different filesystem + types (such as local storage, cloud storage via fsspec, or pyarrow filesystem), + and supports different data type backends, including `numpy_nullable` and `pyarrow`. + Parameters ---------- path : str, path object, or file-like object From aeb634aeb28b05c96c1fac3a8a33825e6227b7d9 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 5 Feb 2025 23:12:00 +0530 Subject: [PATCH 21/68] DOC: fix ES01 for pandas.plotting.table (#60852) --- pandas/plotting/_misc.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 3f839cefe798e..0e0fb23d924bc 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -30,6 +30,13 @@ def table(ax: Axes, data: DataFrame | Series, **kwargs) -> Table: """ Helper function to convert DataFrame and Series to matplotlib.table. + This method provides an easy way to visualize tabular data within a Matplotlib + figure. It automatically extracts index and column labels from the DataFrame + or Series, unless explicitly specified. This function is particularly useful + when displaying summary tables alongside other plots or when creating static + reports. It utilizes the `matplotlib.pyplot.table` backend and allows + customization through various styling options available in Matplotlib. 
+ Parameters ---------- ax : Matplotlib axes object From 59e324f01530c46d91c890e5437828a42797f3c1 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 5 Feb 2025 23:12:34 +0530 Subject: [PATCH 22/68] DOC: fix ES01 for pandas.Interval.is_empty (#60854) --- pandas/_libs/interval.pyx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 564019d7c0d8c..5d0876591a151 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -209,6 +209,12 @@ cdef class IntervalMixin: """ Indicates if an interval is empty, meaning it contains no points. + An interval is considered empty if its `left` and `right` endpoints + are equal, and it is not closed on both sides. This means that the + interval does not include any real points. In the case of an + :class:`pandas.arrays.IntervalArray` or :class:`IntervalIndex`, the + property returns a boolean array indicating the emptiness of each interval. + Returns ------- bool or ndarray From dc315ee205e4e2dde7ba0c4c649ca81aeab6036e Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 5 Feb 2025 23:13:05 +0530 Subject: [PATCH 23/68] DOC: fix ES01 for pandas.Period.day (#60853) --- pandas/_libs/tslibs/period.pyx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index f697180da5eeb..087d3119c36f2 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2140,6 +2140,12 @@ cdef class _Period(PeriodMixin): """ Get day of the month that a Period falls on. + The `day` property provides a simple way to access the day component + of a `Period` object, which represents time spans in various frequencies + (e.g., daily, hourly, monthly). If the period's frequency does not include + a day component (e.g., yearly or quarterly periods), the returned day + corresponds to the first day of that period. 
+ Returns ------- int From 1cd4c63493960578c65c37e2902194c3b23baf82 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 5 Feb 2025 09:43:52 -0800 Subject: [PATCH 24/68] TST: Remove test_pickle_generalurl_read in favor of test_request_headers (#60848) Remove test_pickle_generalurl_read in favor of test_request_headers --- pandas/tests/io/test_pickle.py | 49 ---------------------------------- 1 file changed, 49 deletions(-) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 5fe0f1265edff..bab2c1561eb99 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -383,55 +383,6 @@ def test_pickle_buffer_roundtrip(): tm.assert_frame_equal(df, result) -# --------------------- -# tests for URL I/O -# --------------------- - - -@pytest.mark.parametrize( - "mockurl", ["http://url.com", "ftp://test.com", "http://gzip.com"] -) -def test_pickle_generalurl_read(monkeypatch, mockurl): - def python_pickler(obj, path): - with open(path, "wb") as fh: - pickle.dump(obj, fh, protocol=-1) - - class MockReadResponse: - def __init__(self, path) -> None: - self.file = open(path, "rb") - if "gzip" in path: - self.headers = {"Content-Encoding": "gzip"} - else: - self.headers = {"Content-Encoding": ""} - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() - - def read(self): - return self.file.read() - - def close(self): - return self.file.close() - - with tm.ensure_clean() as path: - - def mock_urlopen_read(*args, **kwargs): - return MockReadResponse(path) - - df = DataFrame( - 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), - ) - python_pickler(df, path) - monkeypatch.setattr("urllib.request.urlopen", mock_urlopen_read) - result = pd.read_pickle(mockurl) - tm.assert_frame_equal(df, result) - - def test_pickle_fsspec_roundtrip(): pytest.importorskip("fsspec") with tm.ensure_clean(): From 51b12e8d5fd67b22904aefda7cbadd4b83ef8970 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Wed, 5 Feb 2025 09:46:46 -0800 Subject: [PATCH 25/68] BUG: Avoid casting to float for datetimelike in min/max reductions (#60850) * BUG: Avoid casting to float for datetimelike in min/max reductions * Fix a line and add whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/nanops.py | 11 ++++++-- pandas/tests/frame/test_reductions.py | 38 +++++++++++++++++++++++++++ 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 95b5f7eea5eeb..bf7bac09b921e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -631,6 +631,7 @@ Datetimelike - Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56147`) - Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`) - Bug in :func:`tseries.frequencies.to_offset` would fail to parse frequency strings starting with "LWOM" (:issue:`59218`) +- Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` casting ``datetime64`` and ``timedelta64`` columns to ``float64`` and losing precision (:issue:`60850`) - Bug in :meth:`Dataframe.agg` with df with missing values resulting in IndexError (:issue:`58810`) - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` does not raise on Custom business 
days frequencies bigger then "1C" (:issue:`58664`) - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` returning ``False`` on double-digit frequencies (:issue:`58523`) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index d6154e2352c63..d1dc0ff809497 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1093,11 +1093,14 @@ def reduction( if values.size == 0: return _na_for_min_count(values, axis) + dtype = values.dtype values, mask = _get_values( values, skipna, fill_value_typ=fill_value_typ, mask=mask ) result = getattr(values, meth)(axis) - result = _maybe_null_out(result, axis, mask, values.shape) + result = _maybe_null_out( + result, axis, mask, values.shape, datetimelike=dtype.kind in "mM" + ) return result return reduction @@ -1499,6 +1502,7 @@ def _maybe_null_out( mask: npt.NDArray[np.bool_] | None, shape: tuple[int, ...], min_count: int = 1, + datetimelike: bool = False, ) -> np.ndarray | float | NaTType: """ Returns @@ -1520,7 +1524,10 @@ def _maybe_null_out( null_mask = np.broadcast_to(below_count, new_shape) if np.any(null_mask): - if is_numeric_dtype(result): + if datetimelike: + # GH#60646 For datetimelike, no need to cast to float + result[null_mask] = iNaT + elif is_numeric_dtype(result): if np.iscomplexobj(result): result = result.astype("c16") elif not is_float_dtype(result): diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 04b1456cdbea6..64e686d25faa7 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1544,6 +1544,44 @@ def test_min_max_dt64_with_NaT(self): exp = Series([pd.NaT], index=["foo"]) tm.assert_series_equal(res, exp) + def test_min_max_dt64_with_NaT_precision(self): + # GH#60646 Make sure the reduction doesn't cast input timestamps to + # float and lose precision. + df = DataFrame( + {"foo": [pd.NaT, pd.NaT, Timestamp("2012-05-01 09:20:00.123456789")]}, + dtype="datetime64[ns]", + ) + + res = df.min(axis=1) + exp = df.foo.rename(None) + tm.assert_series_equal(res, exp) + + res = df.max(axis=1) + exp = df.foo.rename(None) + tm.assert_series_equal(res, exp) + + def test_min_max_td64_with_NaT_precision(self): + # GH#60646 Make sure the reduction doesn't cast input timedeltas to + # float and lose precision. + df = DataFrame( + { + "foo": [ + pd.NaT, + pd.NaT, + to_timedelta("10000 days 06:05:01.123456789"), + ], + }, + dtype="timedelta64[ns]", + ) + + res = df.min(axis=1) + exp = df.foo.rename(None) + tm.assert_series_equal(res, exp) + + res = df.max(axis=1) + exp = df.foo.rename(None) + tm.assert_series_equal(res, exp) + def test_min_max_dt64_with_NaT_skipna_false(self, request, tz_naive_fixture): # GH#36907 tz = tz_naive_fixture From 57340ecd08580f26ee4a976c1f68b2f563c41569 Mon Sep 17 00:00:00 2001 From: Jake Thomas Trevallion <136272202+JakeTT404@users.noreply.github.com> Date: Wed, 5 Feb 2025 17:48:56 +0000 Subject: [PATCH 26/68] ENH: Improved error message and raise new error for small-string NaN edge case in HDFStore.append (#60829) * Add clearer error messages for datatype mismatch in HDFStore.append. Raise ValueError when nan_rep too large for pytable column. Add and modify applicable test code. * Fix missed tests and correct mistake in error message. * Remove excess comments. Reverse error type change to avoid api changes. Move nan_rep tests into separate function. 
--- pandas/io/pytables.py | 9 ++++++ pandas/tests/io/pytables/test_append.py | 35 +++++++++++++++++---- pandas/tests/io/pytables/test_round_trip.py | 9 ++---- 3 files changed, 41 insertions(+), 12 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index e18db2e53113f..b4c78b063c180 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3524,6 +3524,12 @@ def validate(self, other) -> None: # Value of type "Optional[Any]" is not indexable [index] oax = ov[i] # type: ignore[index] if sax != oax: + if c == "values_axes" and sax.kind != oax.kind: + raise ValueError( + f"Cannot serialize the column [{oax.values[0]}] " + f"because its data contents are not [{sax.kind}] " + f"but [{oax.kind}] object dtype" + ) raise ValueError( f"invalid combination of [{c}] on appending data " f"[{sax}] vs current table [{oax}]" @@ -5136,6 +5142,9 @@ def _maybe_convert_for_string_atom( data = bvalues.copy() data[mask] = nan_rep + if existing_col and mask.any() and len(nan_rep) > existing_col.itemsize: + raise ValueError("NaN representation is too large for existing column size") + # see if we have a valid string type inferred_type = lib.infer_dtype(data, skipna=False) if inferred_type != "string": diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index 47658c0eb9012..04241a78bff5f 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -823,12 +823,9 @@ def test_append_raise(setup_path): store.append("df", df) df["foo"] = "bar" msg = re.escape( - "invalid combination of [values_axes] on appending data " - "[name->values_block_1,cname->values_block_1," - "dtype->bytes24,kind->string,shape->(1, 30)] " - "vs current table " - "[name->values_block_1,cname->values_block_1," - "dtype->datetime64[s],kind->datetime64[s],shape->None]" + "Cannot serialize the column [foo] " + "because its data contents are not [string] " + "but [datetime64[s]] object dtype" ) with pytest.raises(ValueError, match=msg): store.append("df", df) @@ -997,3 +994,29 @@ def test_append_to_multiple_min_itemsize(setup_path): ) result = store.select_as_multiple(["index", "nums", "strs"]) tm.assert_frame_equal(result, expected, check_index_type=True) + + +def test_append_string_nan_rep(setup_path): + # GH 16300 + df = DataFrame({"A": "a", "B": "foo"}, index=np.arange(10)) + df_nan = df.copy() + df_nan.loc[0:4, :] = np.nan + msg = "NaN representation is too large for existing column size" + + with ensure_clean_store(setup_path) as store: + # string column too small + store.append("sa", df["A"]) + with pytest.raises(ValueError, match=msg): + store.append("sa", df_nan["A"]) + + # nan_rep too big + store.append("sb", df["B"], nan_rep="bars") + with pytest.raises(ValueError, match=msg): + store.append("sb", df_nan["B"]) + + # smaller modified nan_rep + store.append("sc", df["A"], nan_rep="n") + store.append("sc", df_nan["A"]) + result = store["sc"] + expected = concat([df["A"], df_nan["A"]]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 6b98a720e4299..875a792467828 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -213,12 +213,9 @@ def test_table_values_dtypes_roundtrip(setup_path): # incompatible dtype msg = re.escape( - "invalid combination of [values_axes] on appending data " - "[name->values_block_0,cname->values_block_0," - "dtype->float64,kind->float,shape->(1, 3)] 
vs " - "current table [name->values_block_0," - "cname->values_block_0,dtype->int64,kind->integer," - "shape->None]" + "Cannot serialize the column [a] " + "because its data contents are not [float] " + "but [integer] object dtype" ) with pytest.raises(ValueError, match=msg): store.append("df_i8", df1)
From 0e245de0bd1b71f903bb16a03ff45fc6d7625946 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quang=20Nguy=E1=BB=85n?= <30631476+quangngd@users.noreply.github.com> Date: Thu, 6 Feb 2025 00:49:31 +0700 Subject: [PATCH 27/68] ENH: handle frozenset in pprint (#60828) * Add test * handle frozenset * add whatsnew * Update doc/source/whatsnew/v3.0.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/formats/printing.py | 2 ++ pandas/tests/io/formats/test_printing.py | 3 +++ 3 files changed, 6 insertions(+)
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index bf7bac09b921e..5f5fbc4dce2d4 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -68,6 +68,7 @@ Other enhancements - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`) - :meth:`Series.str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) +- :py:class:`frozenset` elements in pandas objects are now natively printed (:issue:`60690`) - Implemented :meth:`Series.str.isascii` and :meth:`Series.str.isascii` (:issue:`59091`) - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`) - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index b466e986450b1..5a52ee78cb9be 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -111,6 +111,8 @@ def _pprint_seq( """ if isinstance(seq, set): fmt = "{{{body}}}" + elif isinstance(seq, frozenset): + fmt = "frozenset({body})" else: fmt = "[{body}]" if hasattr(seq, "__setitem__") else "({body})"
diff --git a/pandas/tests/io/formats/test_printing.py b/pandas/tests/io/formats/test_printing.py index 3b63011bf862e..f86b4af2647f8 100644 --- a/pandas/tests/io/formats/test_printing.py +++ b/pandas/tests/io/formats/test_printing.py @@ -82,6 +82,9 @@ def test_repr_dict(self): def test_repr_mapping(self): assert printing.pprint_thing(MyMapping()) == "{'a': 4, 'b': 4}" + def test_repr_frozenset(self): + assert printing.pprint_thing(frozenset([1, 2])) == "frozenset(1, 2)" + class TestFormatBase: def test_adjoin(self):
From 2cc9b21c9ad9b3df0f084b6d2e8462b1b78d4e8a Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Wed, 5 Feb 2025 09:51:37 -0800 Subject: [PATCH 28/68] ENH: Support 'left_anti' and 'right_anti' joins in pd.merge (#60732) * ENH: Support 'left_anti' and 'right_anti' joins in pd.merge * Fix mypy errors * Fix another mypy error * Restructure a bit * Fix mypy typing error * Fix test * Fix arrow string test * Fix future string test * Retry fix * Address review comment ---
doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_typing.py | 4 +- pandas/core/frame.py | 14 +- pandas/core/reshape/merge.py | 97 +++++- pandas/tests/frame/methods/test_join.py | 15 +- pandas/tests/reshape/merge/test_merge.py | 5 +- .../reshape/merge/test_merge_antijoin.py | 280 ++++++++++++++++++ 7 files changed, 400 insertions(+), 16 deletions(-) create mode 100644 pandas/tests/reshape/merge/test_merge_antijoin.py diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 5f5fbc4dce2d4..7ebbfd5bf75be 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -35,6 +35,7 @@ Other enhancements - :class:`pandas.api.typing.NoDefault` is available for typing ``no_default`` - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) - :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`) +- :func:`pandas.merge`, :meth:`DataFrame.merge` and :meth:`DataFrame.join` now support anti joins (``left_anti`` and ``right_anti``) in the ``how`` parameter (:issue:`42916`) - :func:`read_spss` now supports kwargs to be passed to pyreadstat (:issue:`56356`) - :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`) - :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`). diff --git a/pandas/_typing.py b/pandas/_typing.py index b515305fb6903..4365ee85f72e3 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -442,7 +442,9 @@ def closed(self) -> bool: AnyAll = Literal["any", "all"] # merge -MergeHow = Literal["left", "right", "inner", "outer", "cross"] +MergeHow = Literal[ + "left", "right", "inner", "outer", "cross", "left_anti", "right_anti" +] MergeValidate = Literal[ "one_to_one", "1:1", diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b36791e6e1dd1..72fc099f57599 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -315,7 +315,8 @@ ----------%s right : DataFrame or named Series Object to merge with. -how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' +how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'}, + default 'inner' Type of merge to be performed. * left: use only keys from left frame, similar to a SQL left outer join; @@ -328,6 +329,10 @@ join; preserve the order of the left keys. * cross: creates the cartesian product from both frames, preserves the order of the left keys. + * left_anti: use only keys from left frame that are not in right frame, similar + to SQL left anti join; preserve key order. + * right_anti: use only keys from right frame that are not in left frame, similar + to SQL right anti join; preserve key order. on : label or list Column or index level names to join on. These must be found in both DataFrames. If `on` is None and not merging on indexes then this defaults @@ -10613,7 +10618,8 @@ def join( values given, the `other` DataFrame must have a MultiIndex. Can pass an array as the join key if it is not already contained in the calling DataFrame. Like an Excel VLOOKUP operation. - how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'left' + how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'}, + default 'left' How to handle the operation of the two objects. 
* left: use calling frame's index (or column if on is specified) @@ -10625,6 +10631,10 @@ def join( of the calling's one. * cross: creates the cartesian product from both frames, preserves the order of the left keys. + * left_anti: use set difference of calling frame's index and `other`'s + index. + * right_anti: use set difference of `other`'s index and calling frame's + index. lsuffix : str, default '' Suffix to use from left frame's overlapping columns. rsuffix : str, default ''
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index ab056c8cc7e37..09be82c59a5c6 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -180,7 +180,8 @@ def merge( First pandas object to merge. right : DataFrame or named Series Second pandas object to merge. - how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' + how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'}, + default 'inner' Type of merge to be performed. * left: use only keys from left frame, similar to a SQL left outer join; @@ -193,6 +194,10 @@ join; preserve the order of the left keys. * cross: creates the cartesian product from both frames, preserves the order of the left keys. + * left_anti: use only keys from left frame that are not in right frame, similar + to SQL left anti join; preserve key order. + * right_anti: use only keys from right frame that are not in left frame, similar + to SQL right anti join; preserve key order. on : label or list Column or index level names to join on. These must be found in both DataFrames. If `on` is None and not merging on indexes then this defaults @@ -953,7 +958,7 @@ def __init__( self, left: DataFrame | Series, right: DataFrame | Series, - how: JoinHow | Literal["asof"] = "inner", + how: JoinHow | Literal["left_anti", "right_anti", "asof"] = "inner", on: IndexLabel | AnyArrayLike | None = None, left_on: IndexLabel | AnyArrayLike | None = None, right_on: IndexLabel | AnyArrayLike | None = None, @@ -968,7 +973,7 @@ def __init__( _right = _validate_operand(right) self.left = self.orig_left = _left self.right = self.orig_right = _right - self.how = how + self.how, self.anti_join = self._validate_how(how) self.on = com.maybe_make_list(on) @@ -998,14 +1003,6 @@ ) raise MergeError(msg) - # GH 59435: raise when "how" is not a valid Merge type - merge_type = {"left", "right", "inner", "outer", "cross", "asof"} - if how not in merge_type: - raise ValueError( - f"'{how}' is not a valid Merge type: " - f"left, right, inner, outer, cross, asof" - ) - self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on) ( @@ -1035,6 +1032,37 @@ if validate is not None: self._validate_validate_kwd(validate) + @final + def _validate_how( + self, how: JoinHow | Literal["left_anti", "right_anti", "asof"] + ) -> tuple[JoinHow | Literal["asof"], bool]: + """ + Validate the 'how' parameter and return the actual join type and whether + this is an anti join. 
+ """ + # GH 59435: raise when "how" is not a valid Merge type + merge_type = { + "left", + "right", + "inner", + "outer", + "left_anti", + "right_anti", + "cross", + "asof", + } + if how not in merge_type: + raise ValueError( + f"'{how}' is not a valid Merge type: " + f"left, right, inner, outer, left_anti, right_anti, cross, asof" + ) + anti_join = False + if how in {"left_anti", "right_anti"}: + how = how.split("_")[0] # type: ignore[assignment] + anti_join = True + how = cast(JoinHow | Literal["asof"], how) + return how, anti_join + def _maybe_require_matching_dtypes( self, left_join_keys: list[ArrayLike], right_join_keys: list[ArrayLike] ) -> None: @@ -1405,6 +1433,11 @@ def _get_join_info( n = len(left_ax) if left_indexer is None else len(left_indexer) join_index = default_index(n) + if self.anti_join: + join_index, left_indexer, right_indexer = self._handle_anti_join( + join_index, left_indexer, right_indexer + ) + return join_index, left_indexer, right_indexer @final @@ -1447,6 +1480,48 @@ def _create_join_index( return index.copy() return index.take(indexer) + @final + def _handle_anti_join( + self, + join_index: Index, + left_indexer: npt.NDArray[np.intp] | None, + right_indexer: npt.NDArray[np.intp] | None, + ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + """ + Handle anti join by returning the correct join index and indexers + + Parameters + ---------- + join_index : Index + join index + left_indexer : np.ndarray[np.intp] or None + left indexer + right_indexer : np.ndarray[np.intp] or None + right indexer + + Returns + ------- + Index, np.ndarray[np.intp] or None, np.ndarray[np.intp] or None + """ + # Make sure indexers are not None + if left_indexer is None: + left_indexer = np.arange(len(self.left)) + if right_indexer is None: + right_indexer = np.arange(len(self.right)) + + assert self.how in {"left", "right"} + if self.how == "left": + # Filter to rows where left keys are not in right keys + filt = right_indexer == -1 + else: + # Filter to rows where right keys are not in left keys + filt = left_indexer == -1 + join_index = join_index[filt] + left_indexer = left_indexer[filt] + right_indexer = right_indexer[filt] + + return join_index, left_indexer, right_indexer + @final def _get_merge_keys( self, diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index 479ea7d7ba692..aaa9485cab580 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -277,7 +277,20 @@ def test_join_index(float_frame): tm.assert_index_equal(joined.index, float_frame.index.sort_values()) tm.assert_index_equal(joined.columns, expected_columns) - join_msg = "'foo' is not a valid Merge type: left, right, inner, outer, cross, asof" + # left anti + joined = f.join(f2, how="left_anti") + tm.assert_index_equal(joined.index, float_frame.index[:5]) + tm.assert_index_equal(joined.columns, expected_columns) + + # right anti + joined = f.join(f2, how="right_anti") + tm.assert_index_equal(joined.index, float_frame.index[10:][::-1]) + tm.assert_index_equal(joined.columns, expected_columns) + + join_msg = ( + "'foo' is not a valid Merge type: left, right, inner, outer, " + "left_anti, right_anti, cross, asof" + ) with pytest.raises(ValueError, match=re.escape(join_msg)): f.join(f2, how="foo") diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index f0abc1afc6ab0..f0f67aebd85ec 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ 
b/pandas/tests/reshape/merge/test_merge.py @@ -1464,7 +1464,10 @@ def test_merge_how_validation(self): data2 = DataFrame( np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"] ) - msg = "'full' is not a valid Merge type: left, right, inner, outer, cross, asof" + msg = ( + "'full' is not a valid Merge type: left, right, inner, outer, " + "left_anti, right_anti, cross, asof" + ) with pytest.raises(ValueError, match=re.escape(msg)): data1.merge(data2, how="full") diff --git a/pandas/tests/reshape/merge/test_merge_antijoin.py b/pandas/tests/reshape/merge/test_merge_antijoin.py new file mode 100644 index 0000000000000..006622c6e5e94 --- /dev/null +++ b/pandas/tests/reshape/merge/test_merge_antijoin.py @@ -0,0 +1,280 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + DataFrame, + MultiIndex, +) +import pandas._testing as tm +from pandas.core.reshape.merge import merge + + +def test_merge_antijoin(): + # GH#42916 + left = DataFrame({"A": [1, 2, 3]}, index=["a", "b", "c"]) + right = DataFrame({"B": [1, 2, 4]}, index=["a", "b", "d"]) + + result = merge(left, right, how="left_anti", left_index=True, right_index=True) + expected = DataFrame({"A": [3], "B": [np.nan]}, index=["c"]) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_index=True, right_index=True) + expected = DataFrame({"A": [np.nan], "B": [4]}, index=["d"]) + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_on_different_columns(): + left = DataFrame({"A": [1.0, 2.0, 3.0], "B": ["a", "b", "c"]}).astype({"B": object}) + right = DataFrame({"C": [1.0, 2.0, 4.0], "D": ["a", "d", "b"]}).astype( + {"D": object} + ) + + result = merge(left, right, how="left_anti", left_on="B", right_on="D") + expected = DataFrame( + { + "A": [3.0], + "B": ["c"], + "C": [np.nan], + "D": [np.nan], + }, + index=[2], + ).astype({"B": object, "D": object}) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="B", right_on="D") + expected = DataFrame( + { + "A": [np.nan], + "B": [np.nan], + "C": [2.0], + "D": ["d"], + }, + index=[1], + ).astype({"B": object, "D": object}) + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_nonunique_keys(): + left = DataFrame({"A": [1.0, 2.0, 3.0], "B": ["a", "b", "b"]}).astype({"B": object}) + right = DataFrame({"C": [1.0, 2.0, 4.0], "D": ["b", "d", "d"]}).astype( + {"D": object} + ) + + result = merge(left, right, how="left_anti", left_on="B", right_on="D") + expected = DataFrame( + { + "A": [1.0], + "B": ["a"], + "C": [np.nan], + "D": [np.nan], + }, + index=[0], + ).astype({"B": object, "D": object}) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="B", right_on="D") + expected = DataFrame( + { + "A": [np.nan, np.nan], + "B": [np.nan, np.nan], + "C": [2.0, 4.0], + "D": ["d", "d"], + }, + index=[2, 3], + ).astype({"B": object, "D": object}) + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_same_df(): + left = DataFrame({"A": [1, 2, 3]}, index=["a", "b", "c"], dtype=np.int64) + result = merge(left, left, how="left_anti", left_index=True, right_index=True) + expected = DataFrame([], columns=["A_x", "A_y"], dtype=np.int64) + tm.assert_frame_equal(result, expected, check_index_type=False) + + +def test_merge_antijoin_nans(): + left = DataFrame({"A": [1.0, 2.0, np.nan], "C": ["a", "b", "c"]}).astype( + {"C": object} + ) + right = DataFrame({"A": 
[3.0, 2.0, np.nan], "D": ["d", "e", "f"]}).astype( + {"D": object} + ) + result = merge(left, right, how="left_anti", on="A") + expected = DataFrame({"A": [1.0], "C": ["a"], "D": [np.nan]}).astype( + {"C": object, "D": object} + ) + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_on_datetime64tz(): + # GH11405 + left = DataFrame( + { + "key": pd.date_range("20151010", periods=2, tz="US/Eastern"), + "value": [1.0, 2.0], + } + ) + right = DataFrame( + { + "key": pd.date_range("20151011", periods=3, tz="US/Eastern"), + "value": [1.0, 2.0, 3.0], + } + ) + + expected = DataFrame( + { + "key": pd.date_range("20151010", periods=1, tz="US/Eastern"), + "value_x": [1.0], + "value_y": [np.nan], + }, + index=[0], + ) + result = merge(left, right, on="key", how="left_anti") + tm.assert_frame_equal(result, expected) + + expected = DataFrame( + { + "key": pd.date_range("20151012", periods=2, tz="US/Eastern"), + "value_x": [np.nan, np.nan], + "value_y": [2.0, 3.0], + }, + index=[1, 2], + ) + result = merge(left, right, on="key", how="right_anti") + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_multiindex(): + left = DataFrame( + { + "A": [1, 2, 3], + "B": [4, 5, 6], + }, + index=MultiIndex.from_tuples( + [("a", "x"), ("b", "y"), ("c", "z")], names=["first", "second"] + ), + ) + right = DataFrame( + { + "C": [7, 8, 9], + "D": [10, 11, 12], + }, + index=MultiIndex.from_tuples( + [("a", "x"), ("b", "y"), ("c", "w")], names=["first", "second"] + ), + ) + + result = merge(left, right, how="left_anti", left_index=True, right_index=True) + expected = DataFrame( + { + "A": [3], + "B": [6], + "C": [np.nan], + "D": [np.nan], + }, + index=MultiIndex.from_tuples([("c", "z")], names=["first", "second"]), + ) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_index=True, right_index=True) + expected = DataFrame( + { + "A": [np.nan], + "B": [np.nan], + "C": [9], + "D": [12], + }, + index=MultiIndex.from_tuples([("c", "w")], names=["first", "second"]), + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype", + [ + "Int64", + pytest.param("int64[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("timestamp[s][pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + ], +) +def test_merge_antijoin_extension_dtype(dtype): + left = DataFrame( + { + "join_col": [1, 3, 5], + "left_val": [1, 2, 3], + } + ) + right = DataFrame( + { + "join_col": [2, 3, 4], + "right_val": [1, 2, 3], + } + ) + left = left.astype({"join_col": dtype}) + right = right.astype({"join_col": dtype}) + result = merge(left, right, how="left_anti", on="join_col") + expected = DataFrame( + { + "join_col": [1, 5], + "left_val": [1, 3], + "right_val": [np.nan, np.nan], + }, + index=[0, 2], + ) + expected = expected.astype({"join_col": dtype}) + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_empty_dataframe(): + left = DataFrame({"A": [], "B": []}) + right = DataFrame({"C": [], "D": []}) + + result = merge(left, right, how="left_anti", left_on="A", right_on="C") + expected = DataFrame({"A": [], "B": [], "C": [], "D": []}) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="A", right_on="C") + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_no_common_elements(): + left = DataFrame({"A": [1, 2, 3]}) + right = DataFrame({"B": [4, 5, 6]}) + + result = merge(left, right, how="left_anti", 
left_on="A", right_on="B") + expected = DataFrame({"A": [1, 2, 3], "B": [np.nan, np.nan, np.nan]}) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [np.nan, np.nan, np.nan], "B": [4, 5, 6]}) + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_with_null_values(): + left = DataFrame({"A": [1.0, 2.0, None, 4.0]}) + right = DataFrame({"B": [2.0, None, 5.0]}) + + result = merge(left, right, how="left_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [1.0, 4.0], "B": [np.nan, np.nan]}, index=[0, 3]) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [np.nan], "B": [5.0]}, index=[2]) + tm.assert_frame_equal(result, expected) + + +def test_merge_antijoin_with_mixed_dtypes(): + left = DataFrame({"A": [1, "2", 3.0]}) + right = DataFrame({"B": ["2", 3.0, 4]}) + + result = merge(left, right, how="left_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [1], "B": [np.nan]}, dtype=object) + tm.assert_frame_equal(result, expected) + + result = merge(left, right, how="right_anti", left_on="A", right_on="B") + expected = DataFrame({"A": [np.nan], "B": [4]}, dtype=object, index=[2]) + tm.assert_frame_equal(result, expected) From 51187b38c83d52a944434abe54cc0684d116608a Mon Sep 17 00:00:00 2001 From: Trinh Quoc Anh Date: Wed, 5 Feb 2025 20:32:52 +0100 Subject: [PATCH 29/68] Disallow empty comment (#60858) --- pandas/tests/extension/decimal/array.py | 1 - pandas/tests/scalar/period/test_period.py | 5 ----- pandas/tests/scalar/timestamp/methods/test_round.py | 1 - pyproject.toml | 2 -- 4 files changed, 9 deletions(-) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 59f313b4c9edb..2ee6a73ec4054 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -125,7 +125,6 @@ def to_numpy( return result def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): - # if not all( isinstance(t, self._HANDLED_TYPES + (DecimalArray,)) for t in inputs ): diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index fe51817a78be8..baaedaa853565 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -991,7 +991,6 @@ def test_properties_quarterly(self): qedec_date = Period(freq="Q-DEC", year=2007, quarter=1) qejan_date = Period(freq="Q-JAN", year=2007, quarter=1) qejun_date = Period(freq="Q-JUN", year=2007, quarter=1) - # for x in range(3): for qd in (qedec_date, qejan_date, qejun_date): assert (qd + x).qyear == 2007 @@ -1016,7 +1015,6 @@ def test_properties_monthly(self): def test_properties_weekly(self): # Test properties on Periods with daily frequency. w_date = Period(freq="W", year=2007, month=1, day=7) - # assert w_date.year == 2007 assert w_date.quarter == 1 assert w_date.month == 1 @@ -1046,7 +1044,6 @@ def test_properties_daily(self): # Test properties on Periods with daily frequency. with tm.assert_produces_warning(FutureWarning, match=bday_msg): b_date = Period(freq="B", year=2007, month=1, day=1) - # assert b_date.year == 2007 assert b_date.quarter == 1 assert b_date.month == 1 @@ -1089,7 +1086,6 @@ def test_properties_hourly(self): def test_properties_minutely(self): # Test properties on Periods with minutely frequency. 
t_date = Period(freq="Min", year=2007, month=1, day=1, hour=0, minute=0) - # assert t_date.quarter == 1 assert t_date.month == 1 assert t_date.day == 1 @@ -1108,7 +1104,6 @@ def test_properties_secondly(self): s_date = Period( freq="Min", year=2007, month=1, day=1, hour=0, minute=0, second=0 ) - # assert s_date.year == 2007 assert s_date.quarter == 1 assert s_date.month == 1 diff --git a/pandas/tests/scalar/timestamp/methods/test_round.py b/pandas/tests/scalar/timestamp/methods/test_round.py index 944aa55727217..6b27e5e6c5554 100644 --- a/pandas/tests/scalar/timestamp/methods/test_round.py +++ b/pandas/tests/scalar/timestamp/methods/test_round.py @@ -165,7 +165,6 @@ def test_round_dst_border_ambiguous(self, method, unit): # GH 18946 round near "fall back" DST ts = Timestamp("2017-10-29 00:00:00", tz="UTC").tz_convert("Europe/Madrid") ts = ts.as_unit(unit) - # result = getattr(ts, method)("h", ambiguous=True) assert result == ts assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value diff --git a/pyproject.toml b/pyproject.toml index c6af69438f849..665d0c93d2918 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -362,8 +362,6 @@ ignore = [ "PLR1733", # 5 errors, it seems like we wannt to ignore these # Unnecessary lookup of list item by index "PLR1736", # 4 errors, we're currently having inline pylint ignore - # empty-comment - "PLR2044", # autofixable # Unpacking a dictionary in iteration without calling `.items()` "PLE1141", # autofixable # import-outside-toplevel From 70edaa0b4661df6f251f2e3d3ae5c55ef371fc74 Mon Sep 17 00:00:00 2001 From: Trinh Quoc Anh Date: Wed, 5 Feb 2025 22:46:22 +0100 Subject: [PATCH 30/68] Remove unused code (#60860) --- pandas/core/computation/parsing.py | 44 ------------------------------ 1 file changed, 44 deletions(-) diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index 35a6d1c6ad269..8441941797a6e 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -123,16 +123,6 @@ def clean_column_name(name: Hashable) -> Hashable: ------- name : hashable Returns the name after tokenizing and cleaning. - - Notes - ----- - For some cases, a name cannot be converted to a valid Python identifier. - In that case :func:`tokenize_string` raises a SyntaxError. - In that case, we just return the name unmodified. - - If this name was used in the query string (this makes the query call impossible) - an error will be raised by :func:`tokenize_backtick_quoted_string` instead, - which is not caught and propagates to the user level. """ try: # Escape backticks @@ -145,40 +135,6 @@ def clean_column_name(name: Hashable) -> Hashable: return name -def tokenize_backtick_quoted_string( - token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int -) -> tuple[int, str]: - """ - Creates a token from a backtick quoted string. - - Moves the token_generator forwards till right after the next backtick. - - Parameters - ---------- - token_generator : Iterator[tokenize.TokenInfo] - The generator that yields the tokens of the source string (Tuple[int, str]). - The generator is at the first token after the backtick (`) - - source : str - The Python source code string. - - string_start : int - This is the start of backtick quoted string inside the source string. - - Returns - ------- - tok: Tuple[int, str] - The token that represents the backtick quoted string. - The integer is equal to BACKTICK_QUOTED_STRING (100). 
- """ - for _, tokval, start, _, _ in token_generator: - if tokval == "`": - string_end = start[1] - break - - return BACKTICK_QUOTED_STRING, source[string_start:string_end] - - class ParseState(Enum): DEFAULT = 0 IN_BACKTICK = 1 From 86c266840aef8dd7d7b692b385aa8ecbaf0371b8 Mon Sep 17 00:00:00 2001 From: Trinh Quoc Anh Date: Thu, 6 Feb 2025 18:15:34 +0100 Subject: [PATCH 31/68] Enable get-attr-with-constant (B009) (#60862) * Enable get-attr-with-constant (B009) * Enable get-attr-with-constant (B009) --- pandas/core/arraylike.py | 2 +- pandas/core/common.py | 2 +- pandas/io/formats/style.py | 10 ++++------ pandas/tests/generic/test_finalize.py | 2 +- pandas/tests/groupby/test_apply.py | 2 +- pandas/tests/groupby/test_groupby.py | 2 +- pandas/tests/io/pytables/test_store.py | 2 +- pyproject.toml | 2 -- 8 files changed, 10 insertions(+), 14 deletions(-) diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py index 43ac69508d1a4..51ddd9e91b227 100644 --- a/pandas/core/arraylike.py +++ b/pandas/core/arraylike.py @@ -329,7 +329,7 @@ def array_ufunc(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes)) if self.ndim == 1: - names = {getattr(x, "name") for x in inputs if hasattr(x, "name")} + names = {x.name for x in inputs if hasattr(x, "name")} name = names.pop() if len(names) == 1 else None reconstruct_kwargs = {"name": name} else: diff --git a/pandas/core/common.py b/pandas/core/common.py index 9788ec972ba1b..100ad312bd839 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -359,7 +359,7 @@ def is_full_slice(obj, line: int) -> bool: def get_callable_name(obj): # typical case has name if hasattr(obj, "__name__"): - return getattr(obj, "__name__") + return obj.__name__ # some objects don't; could recurse if isinstance(obj, partial): return get_callable_name(obj.func) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index b4c55da3eddd6..f2ec41d2c6a43 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -2021,7 +2021,7 @@ def apply( more details. """ self._todo.append( - (lambda instance: getattr(instance, "_apply"), (func, axis, subset), kwargs) + (lambda instance: instance._apply, (func, axis, subset), kwargs) ) return self @@ -2128,7 +2128,7 @@ def apply_index( """ self._todo.append( ( - lambda instance: getattr(instance, "_apply_index"), + lambda instance: instance._apply_index, (func, axis, level, "apply"), kwargs, ) @@ -2157,7 +2157,7 @@ def map_index( ) -> Styler: self._todo.append( ( - lambda instance: getattr(instance, "_apply_index"), + lambda instance: instance._apply_index, (func, axis, level, "map"), kwargs, ) @@ -2230,9 +2230,7 @@ def map(self, func: Callable, subset: Subset | None = None, **kwargs) -> Styler: See `Table Visualization <../../user_guide/style.ipynb>`_ user guide for more details. 
""" - self._todo.append( - (lambda instance: getattr(instance, "_map"), (func, subset), kwargs) - ) + self._todo.append((lambda instance: instance._map, (func, subset), kwargs)) return self def set_table_attributes(self, attributes: str) -> Styler: diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 433e559ef620e..a88090b00499d 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -644,7 +644,7 @@ def test_timedelta_methods(method): operator.methodcaller("add_categories", ["c"]), operator.methodcaller("as_ordered"), operator.methodcaller("as_unordered"), - lambda x: getattr(x, "codes"), + lambda x: x.codes, operator.methodcaller("remove_categories", "a"), operator.methodcaller("remove_unused_categories"), operator.methodcaller("rename_categories", {"a": "A", "b": "B"}), diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 294ab14c96de8..5bf16ee9ad0b8 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1390,7 +1390,7 @@ def test_empty_df(method, op): # GH 47985 empty_df = DataFrame({"a": [], "b": []}) gb = empty_df.groupby("a", group_keys=True) - group = getattr(gb, "b") + group = gb.b result = getattr(group, method)(op) expected = Series( diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 5bae9b1fd9882..d0ce27b4a22f8 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -264,7 +264,7 @@ def test_attr_wrapper(ts): # make sure raises error msg = "'SeriesGroupBy' object has no attribute 'foo'" with pytest.raises(AttributeError, match=msg): - getattr(grouped, "foo") + grouped.foo def test_frame_groupby(tsframe): diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index a6fe9529c594a..2bfe9e33a6235 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -311,7 +311,7 @@ def test_getattr(setup_path): # test attribute access result = store.a tm.assert_series_equal(result, s) - result = getattr(store, "a") + result = store.a tm.assert_series_equal(result, s) df = DataFrame( diff --git a/pyproject.toml b/pyproject.toml index 665d0c93d2918..b7d53b0d8934a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -272,8 +272,6 @@ ignore = [ "B007", # controversial "B008", - # setattr is used to side-step mypy - "B009", # getattr is used to side-step mypy "B010", # tests use comparisons but not their returned value From 07d299343601cd6692d7a6c5adc74b274fff260f Mon Sep 17 00:00:00 2001 From: "Christine P. Chai" Date: Thu, 6 Feb 2025 09:53:07 -0800 Subject: [PATCH 32/68] DOC: Correct a typo in pyarrow.rst (#60865) --- doc/source/user_guide/pyarrow.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/pyarrow.rst b/doc/source/user_guide/pyarrow.rst index aecbce0441b53..1807341530e69 100644 --- a/doc/source/user_guide/pyarrow.rst +++ b/doc/source/user_guide/pyarrow.rst @@ -22,7 +22,7 @@ Data Structure Integration A :class:`Series`, :class:`Index`, or the columns of a :class:`DataFrame` can be directly backed by a :external+pyarrow:py:class:`pyarrow.ChunkedArray` which is similar to a NumPy array. To construct these from the main pandas data structures, you can pass in a string of the type followed by -``[pyarrow]``, e.g. ``"int64[pyarrow]""`` into the ``dtype`` parameter +``[pyarrow]``, e.g. 
``"int64[pyarrow]"`` into the ``dtype`` parameter .. ipython:: python From 3979e954a339db9fc5e99b72ccb5ceda081c33e5 Mon Sep 17 00:00:00 2001 From: "Christine P. Chai" Date: Thu, 6 Feb 2025 10:28:49 -0800 Subject: [PATCH 33/68] DOC: Update the read_csv in action in cookbook.rst (#60866) --- doc/source/user_guide/cookbook.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index b3decb6342527..91a0b4a4fe967 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -1043,7 +1043,7 @@ CSV The :ref:`CSV ` docs -`read_csv in action `__ +`read_csv in action `__ `appending to a csv `__ From d67055fee6654566d4ecb7cce66a8123f02c8323 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 7 Feb 2025 23:57:02 +0530 Subject: [PATCH 34/68] DOC: fix ES01 for pandas.get_option (#60868) --- pandas/_config/config.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 0d06e6fa8e96c..8d1f61178830f 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -141,6 +141,10 @@ def get_option(pat: str) -> Any: """ Retrieve the value of the specified option. + This method allows users to query the current value of a given option + in the pandas configuration system. Options control various display, + performance, and behavior-related settings within pandas. + Parameters ---------- pat : str From 408abda757215e65e91b313d6b91b4db8ca799e8 Mon Sep 17 00:00:00 2001 From: "Christine P. Chai" Date: Fri, 7 Feb 2025 10:54:27 -0800 Subject: [PATCH 35/68] DOC: Correct a typo in ecosystem.md (#60874) --- web/pandas/community/ecosystem.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md index 29297488da64f..876e6e5b298ea 100644 --- a/web/pandas/community/ecosystem.md +++ b/web/pandas/community/ecosystem.md @@ -8,7 +8,7 @@ developers to build powerful and more focused data tools. The creation of libraries that complement pandas' functionality also allows pandas development to remain focused around its original requirements. -This is an community-maintained list of projects that build on pandas in order +This is a community-maintained list of projects that build on pandas in order to provide tools in the PyData space. The pandas core development team does not necessarily endorse any particular project on this list or have any knowledge of the maintenance status of any particular library. 
From 408abda757215e65e91b313d6b91b4db8ca799e8 Mon Sep 17 00:00:00 2001
From: "Christine P. Chai"
Date: Fri, 7 Feb 2025 10:54:27 -0800
Subject: [PATCH 35/68] DOC: Correct a typo in ecosystem.md (#60874)

---
 web/pandas/community/ecosystem.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md
index 29297488da64f..876e6e5b298ea 100644
--- a/web/pandas/community/ecosystem.md
+++ b/web/pandas/community/ecosystem.md
@@ -8,7 +8,7 @@ developers to build powerful and more focused data tools.
 The creation of libraries that complement pandas' functionality also allows pandas
 development to remain focused around its original requirements.
 
-This is an community-maintained list of projects that build on pandas in order
+This is a community-maintained list of projects that build on pandas in order
 to provide tools in the PyData space. The pandas core development team does not
 necessarily endorse any particular project on this list or have any knowledge of the
 maintenance status of any particular library.
 
 For a more complete list of projects that depend on pandas, see the
 [libraries.io usage page for

From c5ea5248d8e1c3ec4e0414480e574b5f6710e377 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 7 Feb 2025 11:07:33 -0800
Subject: [PATCH 36/68] TST/CI: xfail test_frame_setitem_dask_array_into_new_col for numpy>2.1 (#60873)

---
 pandas/tests/test_downstream.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py
index 18df76ddd8ed8..76fad35304fe6 100644
--- a/pandas/tests/test_downstream.py
+++ b/pandas/tests/test_downstream.py
@@ -20,6 +20,7 @@
     TimedeltaIndex,
 )
 import pandas._testing as tm
+from pandas.util.version import Version
 
 
 @pytest.fixture
@@ -222,7 +223,7 @@ def test_missing_required_dependency():
         assert name in output
 
 
-def test_frame_setitem_dask_array_into_new_col():
+def test_frame_setitem_dask_array_into_new_col(request):
     # GH#47128
 
     # dask sets "compute.use_numexpr" to False, so catch the current value
@@ -230,7 +231,14 @@ def test_frame_setitem_dask_array_into_new_col():
     olduse = pd.get_option("compute.use_numexpr")
 
     try:
+        dask = pytest.importorskip("dask")
         da = pytest.importorskip("dask.array")
+        if Version(dask.__version__) <= Version("2025.1.0") and Version(
+            np.__version__
+        ) >= Version("2.1"):
+            request.applymarker(
+                pytest.mark.xfail(reason="loc.__setitem__ incorrectly mutated column c")
+            )
 
         dda = da.array([1, 2])
         df = DataFrame({"a": ["a", "b"]})

From 0d85d57b18b18e6b216ff081eac0952cb27d0e13 Mon Sep 17 00:00:00 2001
From: "Christine P. Chai"
Date: Fri, 7 Feb 2025 11:50:02 -0800
Subject: [PATCH 37/68] DOC: Update the Numba troubleshooting URL (#60877)

---
 doc/source/user_guide/enhancingperf.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst
index c4721f3a6b09c..e55a6cda47ac2 100644
--- a/doc/source/user_guide/enhancingperf.rst
+++ b/doc/source/user_guide/enhancingperf.rst
@@ -427,7 +427,7 @@ prefer that Numba throw an error if it cannot compile a function in a way that
 speeds up your code, pass Numba the argument
 ``nopython=True`` (e.g. ``@jit(nopython=True)``). For more on
 troubleshooting Numba modes, see the `Numba troubleshooting page
-`__.
+`__.
 
 Using ``parallel=True`` (e.g. ``@jit(parallel=True)``) may result in a ``SIGABRT`` if the threading layer leads to unsafe behavior.
 You can first `specify a safe threading layer `__

From 5b16c06286000d923e35a1e4384e4bc7732a7691 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 7 Feb 2025 12:25:03 -0800
Subject: [PATCH 38/68] TST/CI: Address enforced numpy DeprecationWarning in test_pandas_dtype_numpy_warning (#60875)

TST: Address enforced numpy DeprecationWarning in test_pandas_dtype_numpy_warning
---
 pandas/core/dtypes/common.py       |  2 ++
 pandas/tests/dtypes/test_common.py | 18 +++++++++++++-----
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index b0c8ec1ffc083..e8881ff014a0c 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -1836,6 +1836,8 @@ def pandas_dtype(dtype) -> DtypeObj:
     # raise a consistent TypeError if failed
     try:
         with warnings.catch_warnings():
+            # TODO: warnings.catch_warnings can be removed when numpy>2.2.2
+            # is the minimum version
             # GH#51523 - Series.astype(np.integer) doesn't show
             # numpy deprecation warning of np.integer
             # Hence enabling DeprecationWarning
diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
index fa48393dd183e..2bda2fddec2ff 100644
--- a/pandas/tests/dtypes/test_common.py
+++ b/pandas/tests/dtypes/test_common.py
@@ -22,6 +22,7 @@
 import pandas._testing as tm
 from pandas.api.types import pandas_dtype
 from pandas.arrays import SparseArray
+from pandas.util.version import Version
 
 
 # EA & Actual Dtypes
@@ -788,11 +789,18 @@ def test_validate_allhashable():
 
 def test_pandas_dtype_numpy_warning():
     # GH#51523
-    with tm.assert_produces_warning(
-        DeprecationWarning,
-        check_stacklevel=False,
-        match="Converting `np.integer` or `np.signedinteger` to a dtype is deprecated",
-    ):
+    if Version(np.__version__) <= Version("2.2.2"):
+        ctx = tm.assert_produces_warning(
+            DeprecationWarning,
+            check_stacklevel=False,
+            match=(
+                "Converting `np.integer` or `np.signedinteger` to a dtype is deprecated"
+            ),
+        )
+    else:
+        ctx = tm.external_error_raised(TypeError)
+
+    with ctx:
        pandas_dtype(np.integer)
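[Editor's note: this patch and the next both lean on the same idiom — compare a dependency's `__version__` via `pandas.util.version.Version` and gate a pytest marker on the result. A minimal sketch of the pattern, using the numpy cutoff from the test above; the test body is a placeholder:]

    import numpy as np
    import pytest

    from pandas.util.version import Version

    # Skip on newer numpy, mirroring the cutoff used in
    # test_pandas_dtype_numpy_warning: numpy > 2.2.2 raises a TypeError
    # where older releases emitted a DeprecationWarning.
    @pytest.mark.skipif(
        Version(np.__version__) > Version("2.2.2"),
        reason="behavior changed after numpy 2.2.2",
    )
    def test_old_numpy_behavior():
        assert True  # placeholder body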
From 0a1577f2f0d1c9efda4b7b2d616177691d1ca73b Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 7 Feb 2025 15:32:39 -0800
Subject: [PATCH 39/68] TST/CI: skipif numba tests on Ubuntu ARM for numba 0.61 (#60847)

* TST: Apply skip/xfail markers for numba/dask updates

* skip test_numba_vs_python_noop

* Add reason

* Skip another test

* Append a skipif

* Skip test_info_compute_numba

* add skipif for test_numba

* Add skipif on test_numba

* Add skipif for transform/test_numba

* Remove redundant condition in dask test

* Add skipif to window/test_numba

* skipif for test_online
---
 pandas/tests/apply/test_frame_apply.py       | 10 ++++++++++
 pandas/tests/apply/test_numba.py             | 12 +++++++++++-
 pandas/tests/frame/methods/test_info.py      |  6 +++++-
 pandas/tests/groupby/aggregate/test_numba.py | 12 +++++++++++-
 pandas/tests/groupby/test_numba.py           | 13 +++++++++++--
 pandas/tests/groupby/transform/test_numba.py | 12 +++++++++++-
 pandas/tests/window/test_numba.py            | 12 +++++++++++-
 pandas/tests/window/test_online.py           | 13 +++++++++++--
 8 files changed, 81 insertions(+), 9 deletions(-)

diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
index d36d723c4be6a..b9e407adc3051 100644
--- a/pandas/tests/apply/test_frame_apply.py
+++ b/pandas/tests/apply/test_frame_apply.py
@@ -4,6 +4,8 @@
 import numpy as np
 import pytest
 
+from pandas.compat import is_platform_arm
+
 from pandas.core.dtypes.dtypes import CategoricalDtype
 
 import pandas as pd
@@ -16,6 +18,7 @@
 )
 import pandas._testing as tm
 from pandas.tests.frame.common import zip_frames
+from pandas.util.version import Version
 
 
 @pytest.fixture
@@ -65,6 +68,13 @@ def test_apply(float_frame, engine, request):
 @pytest.mark.parametrize("raw", [True, False])
 @pytest.mark.parametrize("nopython", [True, False])
 def test_apply_args(float_frame, axis, raw, engine, nopython):
+    numba = pytest.importorskip("numba")
+    if (
+        engine == "numba"
+        and Version(numba.__version__) == Version("0.61")
+        and is_platform_arm()
+    ):
+        pytest.skip(f"Segfaults on ARM platforms with numba {numba.__version__}")
     engine_kwargs = {"nopython": nopython}
     result = float_frame.apply(
         lambda x, y: x + y,
diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py
index d6cd9c321ace6..75bc3f5b74b9d 100644
--- a/pandas/tests/apply/test_numba.py
+++ b/pandas/tests/apply/test_numba.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pytest
 
+from pandas.compat import is_platform_arm
 import pandas.util._test_decorators as td
 
 import pandas as pd
@@ -9,8 +10,17 @@
     Index,
 )
 import pandas._testing as tm
+from pandas.util.version import Version
 
-pytestmark = [td.skip_if_no("numba"), pytest.mark.single_cpu]
+pytestmark = [td.skip_if_no("numba"), pytest.mark.single_cpu, pytest.mark.skipif()]
+
+numba = pytest.importorskip("numba")
+pytestmark.append(
+    pytest.mark.skipif(
+        Version(numba.__version__) == Version("0.61") and is_platform_arm(),
+        reason=f"Segfaults on ARM platforms with numba {numba.__version__}",
+    )
+)
 
 
 @pytest.fixture(params=[0, 1])
diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py
index 462d86cadde88..de6737ec3bc39 100644
--- a/pandas/tests/frame/methods/test_info.py
+++ b/pandas/tests/frame/methods/test_info.py
@@ -11,6 +11,7 @@
     HAS_PYARROW,
     IS64,
     PYPY,
+    is_platform_arm,
 )
 
 from pandas import (
@@ -23,6 +24,7 @@
     option_context,
 )
 import pandas._testing as tm
+from pandas.util.version import Version
 
 
 @pytest.fixture
@@ -544,7 +546,9 @@ def test_memory_usage_empty_no_warning(using_infer_string):
 @pytest.mark.single_cpu
 def test_info_compute_numba():
     # GH#51922
-    pytest.importorskip("numba")
+    numba = pytest.importorskip("numba")
+    if Version(numba.__version__) == Version("0.61") and is_platform_arm():
+        pytest.skip(f"Segfaults on ARM platforms with numba {numba.__version__}")
     df = DataFrame([[1, 2], [3, 4]])
 
     with option_context("compute.use_numba", True):
diff --git a/pandas/tests/groupby/aggregate/test_numba.py b/pandas/tests/groupby/aggregate/test_numba.py
index 0cd8a14d97eb0..afddc90fdd055 100644
--- a/pandas/tests/groupby/aggregate/test_numba.py
+++ b/pandas/tests/groupby/aggregate/test_numba.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pytest
 
+from pandas.compat import is_platform_arm
 from pandas.errors import NumbaUtilError
 
 from pandas import (
@@ -11,8 +12,17 @@
     option_context,
 )
 import pandas._testing as tm
+from pandas.util.version import Version
 
-pytestmark = pytest.mark.single_cpu
+pytestmark = [pytest.mark.single_cpu]
+
+numba = pytest.importorskip("numba")
+pytestmark.append(
+    pytest.mark.skipif(
+        Version(numba.__version__) == Version("0.61") and is_platform_arm(),
+        reason=f"Segfaults on ARM platforms with numba {numba.__version__}",
+    )
+)
 
 
 def test_correct_function_signature():
diff --git a/pandas/tests/groupby/test_numba.py b/pandas/tests/groupby/test_numba.py
index 3e32031e51138..082319d8479f0 100644
--- a/pandas/tests/groupby/test_numba.py
+++ b/pandas/tests/groupby/test_numba.py
@@ -1,15 +1,24 @@
 import pytest
 
+from pandas.compat import is_platform_arm
+
 from pandas import (
     DataFrame,
     Series,
     option_context,
 )
 import pandas._testing as tm
+from pandas.util.version import Version
 
-pytestmark = pytest.mark.single_cpu
+pytestmark = [pytest.mark.single_cpu]
 
-pytest.importorskip("numba")
+numba = pytest.importorskip("numba")
+pytestmark.append(
+    pytest.mark.skipif(
+        Version(numba.__version__) == Version("0.61") and is_platform_arm(),
+        reason=f"Segfaults on ARM platforms with numba {numba.__version__}",
+    )
+)
 
 
 @pytest.mark.filterwarnings("ignore")
diff --git a/pandas/tests/groupby/transform/test_numba.py b/pandas/tests/groupby/transform/test_numba.py
index 969df8ef4c52b..e19b7592f75b3 100644
--- a/pandas/tests/groupby/transform/test_numba.py
+++ b/pandas/tests/groupby/transform/test_numba.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pytest
 
+from pandas.compat import is_platform_arm
 from pandas.errors import NumbaUtilError
 
 from pandas import (
@@ -9,8 +10,17 @@
     option_context,
 )
 import pandas._testing as tm
+from pandas.util.version import Version
 
-pytestmark = pytest.mark.single_cpu
+pytestmark = [pytest.mark.single_cpu]
+
+numba = pytest.importorskip("numba")
+pytestmark.append(
+    pytest.mark.skipif(
+        Version(numba.__version__) == Version("0.61") and is_platform_arm(),
+        reason=f"Segfaults on ARM platforms with numba {numba.__version__}",
+    )
+)
 
 
 def test_correct_function_signature():
diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py
index 120dbe788a23f..887aeca6590dc 100644
--- a/pandas/tests/window/test_numba.py
+++ b/pandas/tests/window/test_numba.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pytest
 
+from pandas.compat import is_platform_arm
 from pandas.errors import NumbaUtilError
 import pandas.util._test_decorators as td
 
 from pandas import (
@@ -11,8 +12,17 @@
     to_datetime,
 )
 import pandas._testing as tm
+from pandas.util.version import Version
 
-pytestmark = pytest.mark.single_cpu
+pytestmark = [pytest.mark.single_cpu]
+
+numba = pytest.importorskip("numba")
+pytestmark.append(
+    pytest.mark.skipif(
+        Version(numba.__version__) == Version("0.61") and is_platform_arm(),
+        reason=f"Segfaults on ARM platforms with numba {numba.__version__}",
+    )
+)
 
 
 @pytest.fixture(params=["single", "table"])
diff --git a/pandas/tests/window/test_online.py b/pandas/tests/window/test_online.py
index 14d3a39107bc4..43d55a7992b3c 100644
--- a/pandas/tests/window/test_online.py
+++ b/pandas/tests/window/test_online.py
@@ -1,15 +1,24 @@
 import numpy as np
 import pytest
 
+from pandas.compat import is_platform_arm
+
 from pandas import (
     DataFrame,
     Series,
 )
 import pandas._testing as tm
+from pandas.util.version import Version
 
-pytestmark = pytest.mark.single_cpu
+pytestmark = [pytest.mark.single_cpu]
 
-pytest.importorskip("numba")
+numba = pytest.importorskip("numba")
+pytestmark.append(
+    pytest.mark.skipif(
+        Version(numba.__version__) == Version("0.61") and is_platform_arm(),
+        reason=f"Segfaults on ARM platforms with numba {numba.__version__}",
+    )
+)
 
 
 @pytest.mark.filterwarnings("ignore")

From 3da2c1c14e8ad55d8cd22efee75e8477e4403997 Mon Sep 17 00:00:00 2001
From: SebastianOuslis
Date: Sat, 8 Feb 2025 07:52:14 -0500
Subject: [PATCH 40/68] DOC: Closed parameter not intuitively documented in DataFrame.rolling (#60844)

---
 pandas/core/groupby/groupby.py | 19 ++++++++++---------
 pandas/core/window/rolling.py  | 19 ++++++++++---------
 2 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 27865a60f6ea3..d0c0ed29b6d44 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -3659,10 +3659,10 @@ def rolling(
         Parameters
         ----------
         window : int, timedelta, str, offset, or BaseIndexer subclass
-            Size of the moving window.
+            Interval of the moving window.
 
-            If an integer, the fixed number of observations used for
-            each window.
+            If an integer, the delta between the start and end of each window.
+            The number of points in the window depends on the ``closed`` argument.
 
             If a timedelta, str, or offset, the time period of each window. Each
             window will be a variable sized based on the observations included in
@@ -3709,17 +3709,18 @@ def rolling(
 
         closed : str, default None
             Determines the inclusivity of points in the window
-            If ``'right'``, (First, Last] the last point in the window
+
+            If ``'right'``, uses the window (first, last] meaning the last point
             is included in the calculations.
 
-            If ``'left'``, [First, Last) the first point in the window
+            If ``'left'``, uses the window [first, last) meaning the first point
             is included in the calculations.
 
-            If ``'both'``, [First, Last] all points in the window
-            are included in the calculations.
+            If ``'both'``, uses the window [first, last] meaning all points in
+            the window are included in the calculations.
 
-            If ``'neither'``, (First, Last) the first and last points
-            in the window are excludedfrom calculations.
+            If ``'neither'``, uses the window (first, last) meaning the first
+            and last points in the window are excluded from calculations.
 
             () and [] are referencing open and closed set notation respetively.
 
diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
index b954ce2584c13..69fce8cf2137e 100644
--- a/pandas/core/window/rolling.py
+++ b/pandas/core/window/rolling.py
@@ -881,10 +881,10 @@ class Window(BaseWindow):
     Parameters
     ----------
     window : int, timedelta, str, offset, or BaseIndexer subclass
-        Size of the moving window.
+        Interval of the moving window.
 
-        If an integer, the fixed number of observations used for
-        each window.
+        If an integer, the delta between the start and end of each window.
+        The number of points in the window depends on the ``closed`` argument.
 
        If a timedelta, str, or offset, the time period of each window. Each
         window will be a variable sized based on the observations included in
@@ -930,17 +930,18 @@ class Window(BaseWindow):
 
     closed : str, default None
         Determines the inclusivity of points in the window
-        If ``'right'``, (First, Last] the last point in the window
+
+        If ``'right'``, uses the window (first, last] meaning the last point
         is included in the calculations.
 
-        If ``'left'``, [First, Last) the first point in the window
+        If ``'left'``, uses the window [first, last) meaning the first point
         is included in the calculations.
 
-        If ``'both'``, [First, Last] all points in the window
-        are included in the calculations.
+        If ``'both'``, uses the window [first, last] meaning all points in
+        the window are included in the calculations.
 
-        If ``'neither'``, (First, Last) the first and last points
-        in the window are excludedfrom calculations.
+        If ``'neither'``, uses the window (first, last) meaning the first
+        and last points in the window are excluded from calculations.
 
         () and [] are referencing open and closed set notation respetively.
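[Editor's note: the interval notation in the docstring above becomes concrete with a small offset-based window. `closed` is long-standing public API for `DataFrame.rolling`/`Series.rolling`, so the behavior below is stable:]

    import pandas as pd

    s = pd.Series([1, 2, 3, 4], index=pd.date_range("2020-01-01", periods=4))

    # With closed="right" (the default for offset windows), the window for
    # the row at time t is (t - 2 days, t]: the left edge is open, so the
    # observation sitting exactly two days back is excluded.
    print(s.rolling("2D", closed="right").sum())

    # With closed="both", the window is [t - 2 days, t], so an observation
    # exactly on the left edge is counted as well.
    print(s.rolling("2D", closed="both").sum())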
From deafcf7ff3c7392f2153e7ed5faded5888131d4f Mon Sep 17 00:00:00 2001
From: Tuhin Sharma
Date: Sun, 9 Feb 2025 00:24:13 +0530
Subject: [PATCH 41/68] DOC: fix ES01 for pandas.option_context (#60887)

---
 pandas/_config/config.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pandas/_config/config.py b/pandas/_config/config.py
index 8d1f61178830f..ce53e05608ba7 100644
--- a/pandas/_config/config.py
+++ b/pandas/_config/config.py
@@ -433,6 +433,11 @@ def option_context(*args) -> Generator[None]:
     """
     Context manager to temporarily set options in a ``with`` statement.
 
+    This method allows users to set one or more pandas options temporarily
+    within a controlled block. The previous options' values are restored
+    once the block is exited. This is useful when making temporary adjustments
+    to pandas' behavior without affecting the global state.
+
     Parameters
     ----------
     *args : str | object

From 9001cb2f080fded5af773d2f7e46c37a74c37cd0 Mon Sep 17 00:00:00 2001
From: Tuhin Sharma
Date: Sun, 9 Feb 2025 00:24:53 +0530
Subject: [PATCH 42/68] DOC: fix ES01 for pandas.DataFrame.shape (#60888)

---
 pandas/core/frame.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 72fc099f57599..57a7b9467a05e 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1021,6 +1021,10 @@ def shape(self) -> tuple[int, int]:
         """
         Return a tuple representing the dimensionality of the DataFrame.
 
+        Unlike the `len()` method, which only returns the number of rows, `shape`
+        provides both row and column counts, making it a more informative method for
+        understanding dataset size.
+
         See Also
        --------
         numpy.ndarray.shape : Tuple of array dimensions.

From 6309d04d0397953732faa11a8ab5d6104a913b82 Mon Sep 17 00:00:00 2001
From: Tuhin Sharma
Date: Sun, 9 Feb 2025 00:25:29 +0530
Subject: [PATCH 43/68] DOC: fix ES01 for pandas.array (#60889)

---
 pandas/core/construction.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pandas/core/construction.py b/pandas/core/construction.py
index 50088804e0245..ada492787a179 100644
--- a/pandas/core/construction.py
+++ b/pandas/core/construction.py
@@ -81,6 +81,10 @@ def array(
     """
     Create an array.
 
+    This method constructs an array using pandas extension types when possible.
+    If `dtype` is specified, it determines the type of array returned. Otherwise,
+    pandas attempts to infer the appropriate dtype based on `data`.
+
     Parameters
     ----------
     data : Sequence of objects
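[Editor's note: the `option_context` summary added in patch 41 is worth pinning down with the behavior it describes; this is stable public API:]

    import pandas as pd

    print(pd.get_option("display.max_rows"))  # 60 with default settings

    # Inside the block the options take their temporary values...
    with pd.option_context("display.max_rows", 5, "display.max_columns", 10):
        print(pd.get_option("display.max_rows"))  # 5

    # ...and the previous values are restored on exit, even if the block
    # raises an exception.
    print(pd.get_option("display.max_rows"))  # back to 60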
From c9598ac6fbf7af68daa6d1a481da8257f3b5cf38 Mon Sep 17 00:00:00 2001
From: Tuhin Sharma
Date: Sun, 9 Feb 2025 00:26:12 +0530
Subject: [PATCH 44/68] DOC: fix ES01 for pandas.Period (#60890)

---
 pandas/_libs/tslibs/period.pyx | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx
index 087d3119c36f2..bef1956996b4f 100644
--- a/pandas/_libs/tslibs/period.pyx
+++ b/pandas/_libs/tslibs/period.pyx
@@ -2842,6 +2842,11 @@ class Period(_Period):
     """
     Represents a period of time.
 
+    A `Period` represents a specific time span rather than a point in time.
+    Unlike `Timestamp`, which represents a single instant, a `Period` defines a
+    duration, such as a month, quarter, or year. The exact representation is
+    determined by the `freq` parameter.
+
     Parameters
     ----------
     value : Period, str, datetime, date or pandas.Timestamp, default None
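[Editor's note: a quick sketch of the distinction the new docstring draws — `Period` vs. `Timestamp` is stable pandas API:]

    import pandas as pd

    ts = pd.Timestamp("2024-03-15")     # a single instant
    p = pd.Period("2024-03", freq="M")  # the whole month of March 2024

    # A Period has a start and an end; membership is span-based.
    print(p.start_time, p.end_time)
    print(p.start_time <= ts <= p.end_time)  # True: the instant falls in the span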
From bfbf991e52d317541b6956447e89e4c151da2986 Mon Sep 17 00:00:00 2001
From: ananiavito <48645073+ananiavito@users.noreply.github.com>
Date: Sat, 8 Feb 2025 19:58:47 +0100
Subject: [PATCH 45/68] DOC: fix a few typos in the User Guide (#60884)

* DOC: fix typo in scale.rst

The word "datasets" was needlessly repeated.

* DOC: Fix typo in timeseries.rst

* DOC: Fix typo in timeseries.rst

* DOC: Fix typo in groupby.rst

pandas was uppercase

* DOC: Fix typo in merging.rst

* DOC: Fix typo in merging.rst

* DOC: fix typo in io.rst
---
 doc/source/user_guide/groupby.rst    | 2 +-
 doc/source/user_guide/io.rst         | 2 +-
 doc/source/user_guide/merging.rst    | 4 ++--
 doc/source/user_guide/scale.rst      | 2 +-
 doc/source/user_guide/timeseries.rst | 4 ++--
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst
index 4a32381a7de47..4ec34db6ed959 100644
--- a/doc/source/user_guide/groupby.rst
+++ b/doc/source/user_guide/groupby.rst
@@ -418,7 +418,7 @@ You can also include the grouping columns if you want to operate on them.
 
 .. note::
 
-   The ``groupby`` operation in Pandas drops the ``name`` field of the columns Index object
+   The ``groupby`` operation in pandas drops the ``name`` field of the columns Index object
    after the operation. This change ensures consistency in syntax between different
    column selection methods within groupby operations.
 
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index daf323acff129..07d06f61b3fd6 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -990,7 +990,7 @@ Thousand separators
 
 For large numbers that have been written with a thousands separator, you can
 set the ``thousands`` keyword to a string of length 1 so that integers will be parsed
-correctly:
+correctly.
 
 By default, numbers with a thousands separator will be parsed as strings:
 
diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst
index cfd2f40aa93a3..fb707674b4dbf 100644
--- a/doc/source/user_guide/merging.rst
+++ b/doc/source/user_guide/merging.rst
@@ -586,7 +586,7 @@ A string argument to ``indicator`` will use the value as the name for the indica
 Overlapping value columns
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 
-The merge ``suffixes`` argument takes a tuple of list of strings to append to
+The merge ``suffixes`` argument takes a tuple or list of strings to append to
 overlapping column names in the input :class:`DataFrame` to disambiguate the result
 columns:
 
@@ -979,7 +979,7 @@ nearest key rather than equal keys. For each row in the ``left`` :class:`DataFra
 the last row in the ``right`` :class:`DataFrame` are selected where the ``on`` key is less
 than the left's key. Both :class:`DataFrame` must be sorted by the key.
 
-Optionally an :func:`merge_asof` can perform a group-wise merge by matching the
+Optionally :func:`merge_asof` can perform a group-wise merge by matching the
 ``by`` key in addition to the nearest match on the ``on`` key.
 
 .. ipython:: python
diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst
index 29df2994fbc35..d12993f7ead4b 100644
--- a/doc/source/user_guide/scale.rst
+++ b/doc/source/user_guide/scale.rst
@@ -5,7 +5,7 @@ Scaling to large datasets
 *************************
 
 pandas provides data structures for in-memory analytics, which makes using pandas
-to analyze datasets that are larger than memory datasets somewhat tricky. Even datasets
+to analyze datasets that are larger than memory somewhat tricky. Even datasets
 that are a sizable fraction of memory become unwieldy, as some pandas operations need to
 make intermediate copies.
 
diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst
index 4299dca4774b9..d046d13f71daf 100644
--- a/doc/source/user_guide/timeseries.rst
+++ b/doc/source/user_guide/timeseries.rst
@@ -1580,7 +1580,7 @@ the pandas objects.
     ts = ts[:5]
     ts.shift(1)
 
-The ``shift`` method accepts an ``freq`` argument which can accept a
+The ``shift`` method accepts a ``freq`` argument which can accept a
 ``DateOffset`` class or other ``timedelta``-like object or also an
 :ref:`offset alias `.
 
@@ -2570,7 +2570,7 @@ because daylight savings time (DST) in a local time zone causes some times to oc
 twice within one day ("clocks fall back"). The following options are available:
 
 * ``'raise'``: Raises a ``ValueError`` (the default behavior)
-* ``'infer'``: Attempt to determine the correct offset base on the monotonicity of the timestamps
+* ``'infer'``: Attempt to determine the correct offset based on the monotonicity of the timestamps
 * ``'NaT'``: Replaces ambiguous times with ``NaT``
 * ``bool``: ``True`` represents a DST time, ``False`` represents non-DST time. An
   array-like of ``bool`` values is supported for a sequence of times.

From e557039fda7d4325184cf76520892b3a635ec2dd Mon Sep 17 00:00:00 2001
From: Nitish Satyavolu
Date: Sat, 8 Feb 2025 11:00:13 -0800
Subject: [PATCH 46/68] BUG: Don't ignore errors when casting dtype in Series constructor (#60882)

* BUG: Don't ignore errors when casting dtype in Series constructor

* Add test and whatsnew
---
 doc/source/whatsnew/v3.0.0.rst           | 1 +
 pandas/core/series.py                    | 2 +-
 pandas/tests/series/test_constructors.py | 7 +++++++
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 7ebbfd5bf75be..570faa00e97a8 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -793,6 +793,7 @@ Styler
 Other
 ^^^^^
 - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`)
+- Bug in :class:`Series` ignoring errors when trying to convert :class:`Series` input data to the given ``dtype`` (:issue:`60728`)
 - Bug in :func:`eval` on :class:`ExtensionArray` on including division ``/`` failed with a ``TypeError``. (:issue:`58748`)
 - Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`)
 - Bug in :func:`eval` with ``engine="numexpr"`` returning unexpected result for float division. (:issue:`59736`)
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 4fa8b86fa4c16..351622135b31f 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -500,7 +500,7 @@ def __init__(
         # create/copy the manager
         if isinstance(data, SingleBlockManager):
             if dtype is not None:
-                data = data.astype(dtype=dtype, errors="ignore")
+                data = data.astype(dtype=dtype)
             elif copy:
                 data = data.copy()
             else:
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index 69f42b5e42878..a2be698c0ec28 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -90,6 +90,13 @@ def test_unparsable_strings_with_dt64_dtype(self):
         with pytest.raises(ValueError, match=msg):
             Series(np.array(vals, dtype=object), dtype="datetime64[ns]")
 
+    def test_invalid_dtype_conversion_datetime_to_timedelta(self):
+        # GH#60728
+        vals = Series([NaT, Timestamp(2025, 1, 1)], dtype="datetime64[ns]")
+        msg = r"^Cannot cast DatetimeArray to dtype timedelta64\[ns\]$"
+        with pytest.raises(TypeError, match=msg):
+            Series(vals, dtype="timedelta64[ns]")
+
     @pytest.mark.parametrize(
         "constructor",
         [
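[Editor's note: the behavior change in patch 46 is easiest to see from the user side. Before the patch, re-wrapping a Series with an incompatible `dtype` silently kept the original dtype, because the internal `astype(..., errors="ignore")` swallowed the failure; after it, the cast error propagates. A sketch against a build containing this patch:]

    import pandas as pd

    dt = pd.Series([pd.NaT, pd.Timestamp(2025, 1, 1)], dtype="datetime64[ns]")

    # Passing a Series plus a dtype it cannot be cast to now raises
    # instead of silently returning the datetime data unchanged.
    try:
        pd.Series(dt, dtype="timedelta64[ns]")
    except TypeError as err:
        print(err)  # Cannot cast DatetimeArray to dtype timedelta64[ns]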
From 4511251ccf409f2ba71cab0283bdf751697ee539 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com>
Date: Mon, 10 Feb 2025 09:23:52 -0500
Subject: [PATCH 47/68] TST(string dtype): Resolve xfails in pytables (#60795)

---
 pandas/io/pytables.py                          |  3 +
 pandas/tests/io/pytables/test_append.py        | 56 ++++++++--------
 pandas/tests/io/pytables/test_categorical.py   |  6 +-
 pandas/tests/io/pytables/test_complex.py       |  6 --
 pandas/tests/io/pytables/test_errors.py        | 18 ++--
 .../tests/io/pytables/test_file_handling.py    | 10 +--
 pandas/tests/io/pytables/test_keys.py          |  7 +-
 pandas/tests/io/pytables/test_put.py           |  4 +-
 pandas/tests/io/pytables/test_read.py          | 16 +++--
 pandas/tests/io/pytables/test_round_trip.py    | 49 ++++++++------
 pandas/tests/io/pytables/test_select.py        | 44 ++++++------
 pandas/tests/io/pytables/test_store.py         | 66 ++++++++++---------
 pandas/tests/io/pytables/test_timezones.py     |  6 --
 13 files changed, 142 insertions(+), 149 deletions(-)

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index b4c78b063c180..abad825a9a0ae 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -5118,6 +5118,9 @@ def _maybe_convert_for_string_atom(
     errors,
     columns: list[str],
 ):
+    if isinstance(bvalues.dtype, StringDtype):
+        # "ndarray[Any, Any]" has no attribute "to_numpy"
+        bvalues = bvalues.to_numpy()  # type: ignore[union-attr]
     if bvalues.dtype != object:
         return bvalues
 
diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py
index 04241a78bff5f..55fdbf1ca2ea5 100644
--- a/pandas/tests/io/pytables/test_append.py
+++ b/pandas/tests/io/pytables/test_append.py
@@ -25,10 +25,7 @@
     ensure_clean_store,
 )
 
-pytestmark = [
-    pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = [pytest.mark.single_cpu]
 
 tables = pytest.importorskip("tables")
 
@@ -40,7 +37,7 @@ def test_append(setup_path):
         #  tables.NaturalNameWarning):
         df = DataFrame(
             np.random.default_rng(2).standard_normal((20, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=20, freq="B"),
         )
         _maybe_remove(store, "df1")
@@ -203,7 +200,7 @@ def test_append_some_nans(setup_path):
         tm.assert_frame_equal(store["df3"], df3, check_index_type=True)
 
 
-def test_append_all_nans(setup_path):
+def test_append_all_nans(setup_path, using_infer_string):
     with ensure_clean_store(setup_path) as store:
         df = DataFrame(
             {
@@ -255,7 +252,13 @@ def test_append_all_nans(setup_path):
         _maybe_remove(store, "df")
         store.append("df", df[:10], dropna=True)
         store.append("df", df[10:], dropna=True)
-        tm.assert_frame_equal(store["df"], df, check_index_type=True)
+        result = store["df"]
+        expected = df
+        if using_infer_string:
+            # TODO: Test is incorrect when not using_infer_string.
+            # Should take the last 4 rows uncondiationally.
+            expected = expected[-4:]
+        tm.assert_frame_equal(result, expected, check_index_type=True)
 
         _maybe_remove(store, "df2")
         store.append("df2", df[:10], dropna=False)
@@ -294,7 +297,7 @@ def test_append_frame_column_oriented(setup_path, request):
         # column oriented
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         df.index = df.index._with_freq(None)  # freq doesn't round-trip
@@ -426,7 +429,7 @@ def check_col(key, name, size):
             {
                 "A": [0.0, 1.0, 2.0, 3.0, 4.0],
                 "B": [0.0, 1.0, 0.0, 1.0, 0.0],
-                "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object),
+                "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"]),
                 "D": date_range("20130101", periods=5),
             }
         ).set_index("C")
@@ -453,7 +456,7 @@ def check_col(key, name, size):
         _maybe_remove(store, "df")
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         df["string"] = "foo"
@@ -513,11 +516,12 @@ def test_append_with_empty_string(setup_path):
         tm.assert_frame_equal(store.select("df"), df)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_append_with_data_columns(setup_path):
     with ensure_clean_store(setup_path) as store:
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         df.iloc[0, df.columns.get_loc("B")] = 1.0
@@ -693,8 +697,8 @@ def test_append_misc(setup_path):
     with ensure_clean_store(setup_path) as store:
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         store.append("df", df, chunksize=1)
         result = store.select("df")
@@ -710,8 +714,8 @@ def test_append_misc_chunksize(setup_path, chunksize):
     # more chunksize in append tests
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
    )
     df["string"] = "foo"
     df["float322"] = 1.0
@@ -747,15 +751,15 @@ def test_append_misc_empty_frame(setup_path):
         tm.assert_frame_equal(store.select("df2"), df)
 
 
-def test_append_raise(setup_path):
+def test_append_raise(setup_path, using_infer_string):
     with ensure_clean_store(setup_path) as store:
         # test append with invalid input to get good error messages
 
         # list in column
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         df["invalid"] = [["a"]] * len(df)
         assert df.dtypes["invalid"] == np.object_
@@ -775,8 +779,8 @@ def test_append_raise(setup_path):
         # datetime with embedded nans as object
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
        )
         s = Series(datetime.datetime(2001, 1, 2), index=df.index)
         s = s.astype(object)
@@ -803,8 +807,8 @@ def test_append_raise(setup_path):
         # appending an incompatible table
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         store.append("df", df)
 
@@ -881,7 +885,7 @@ def test_append_with_timedelta(setup_path):
 def test_append_to_multiple(setup_path):
     df1 = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     df2 = df1.copy().rename(columns="{}_2".format)
@@ -918,12 +922,12 @@ def test_append_to_multiple(setup_path):
 def test_append_to_multiple_dropna(setup_path):
     df1 = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     df2 = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     ).rename(columns="{}_2".format)
     df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan
@@ -943,7 +947,7 @@ def test_append_to_multiple_dropna(setup_path):
 def test_append_to_multiple_dropna_false(setup_path):
     df1 = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     df2 = df1.copy().rename(columns="{}_2".format)
diff --git a/pandas/tests/io/pytables/test_categorical.py b/pandas/tests/io/pytables/test_categorical.py
index 998021bad9001..2f8c37c0b3876 100644
--- a/pandas/tests/io/pytables/test_categorical.py
+++ b/pandas/tests/io/pytables/test_categorical.py
@@ -16,10 +16,7 @@
     ensure_clean_store,
 )
 
-pytestmark = [
-    pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = [pytest.mark.single_cpu]
 
 
 def test_categorical(setup_path):
@@ -143,6 +140,7 @@ def test_categorical(setup_path):
             store.select("df3/meta/s/meta")
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_categorical_conversion(tmp_path, setup_path):
     # GH13322
     # Check that read_hdf with categorical columns doesn't return rows if
diff --git a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py
index d140cfc941e16..c5cac5a5caf09 100644
--- a/pandas/tests/io/pytables/test_complex.py
+++ b/pandas/tests/io/pytables/test_complex.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -13,10 +11,6 @@
 
 from pandas.io.pytables import read_hdf
 
-pytestmark = pytest.mark.xfail(
-    using_string_dtype(), reason="TODO(infer_string)", strict=False
-)
-
 
 def test_complex_fixed(tmp_path, setup_path):
     df = DataFrame(
diff --git a/pandas/tests/io/pytables/test_errors.py b/pandas/tests/io/pytables/test_errors.py
index c31b9989ef35e..b28101c09820f 100644
--- a/pandas/tests/io/pytables/test_errors.py
+++ b/pandas/tests/io/pytables/test_errors.py
@@ -5,8 +5,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas import (
     CategoricalIndex,
     DataFrame,
@@ -24,10 +22,7 @@
     _maybe_adjust_name,
 )
 
-pytestmark = [
-    pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = [pytest.mark.single_cpu]
 
 
 def test_pass_spec_to_storer(setup_path):
@@ -93,9 +88,14 @@ def test_unimplemented_dtypes_table_columns(setup_path):
 
     with ensure_clean_store(setup_path) as store:
         # this fails because we have a date in the object block......
-        msg = re.escape(
-            """Cannot serialize the column [datetime1]
-because its data contents are not [string] but [date] object dtype"""
+        msg = "|".join(
+            [
+                re.escape(
+                    "Cannot serialize the column [datetime1]\nbecause its data "
+                    "contents are not [string] but [date] object dtype"
+                ),
+                re.escape("[date] is not implemented as a table column"),
+            ]
         )
         with pytest.raises(TypeError, match=msg):
             store.append("df_unimplemented", df)
diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py
index 16c3c6798ff76..27b5d34146f85 100644
--- a/pandas/tests/io/pytables/test_file_handling.py
+++ b/pandas/tests/io/pytables/test_file_handling.py
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat import (
     PY311,
     is_ci_environment,
@@ -35,9 +33,7 @@
 from pandas.io import pytables
 from pandas.io.pytables import Term
 
-pytestmark = [
-    pytest.mark.single_cpu,
-]
+pytestmark = [pytest.mark.single_cpu]
 
 
 @pytest.mark.parametrize("mode", ["r", "r+", "a", "w"])
@@ -329,7 +325,6 @@ def test_complibs(tmp_path, lvl, lib, request):
             assert node.filters.complib == lib
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.skipif(
     not is_platform_little_endian(), reason="reason platform is not little endian"
 )
@@ -347,7 +342,6 @@ def test_encoding(setup_path):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize(
     "val",
     [
@@ -362,7 +356,7 @@ def test_encoding(setup_path):
         [b"A\xf8\xfc", np.nan, b"", b"b", b"c"],
     ],
 )
-@pytest.mark.parametrize("dtype", ["category", object])
+@pytest.mark.parametrize("dtype", ["category", None])
 def test_latin_encoding(tmp_path, setup_path, dtype, val):
     enc = "latin-1"
     nan_rep = ""
diff --git a/pandas/tests/io/pytables/test_keys.py b/pandas/tests/io/pytables/test_keys.py
index 7d0802dcf2e47..9c5fc8786c7c6 100644
--- a/pandas/tests/io/pytables/test_keys.py
+++ b/pandas/tests/io/pytables/test_keys.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas import (
     DataFrame,
     HDFStore,
@@ -15,10 +13,7 @@
     tables,
 )
 
-pytestmark = [
-    pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = [pytest.mark.single_cpu]
 
 
 def test_keys(setup_path):
diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py
index 66596f1138b96..c9fe6070b34c3 100644
--- a/pandas/tests/io/pytables/test_put.py
+++ b/pandas/tests/io/pytables/test_put.py
@@ -22,9 +22,7 @@
 )
 from pandas.util import _test_decorators as td
 
-pytestmark = [
-    pytest.mark.single_cpu,
-]
+pytestmark = [pytest.mark.single_cpu]
 
 
 def test_format_type(tmp_path, setup_path):
diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py
index 8ae87d4bab52d..ed4f523a21b1e 100644
--- a/pandas/tests/io/pytables/test_read.py
+++ b/pandas/tests/io/pytables/test_read.py
@@ -26,10 +26,7 @@
 
 from pandas.io.pytables import TableIterator
 
-pytestmark = [
-    pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = [pytest.mark.single_cpu]
 
 
 def test_read_missing_key_close_store(tmp_path, setup_path):
@@ -75,10 +72,11 @@ def test_read_missing_key_opened_store(tmp_path, setup_path):
             read_hdf(store, "k1")
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_read_column(setup_path):
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
 
@@ -175,7 +173,7 @@ def test_pytables_native2_read(datapath):
     assert isinstance(d1, DataFrame)
 
 
-def test_read_hdf_open_store(tmp_path, setup_path):
+def test_read_hdf_open_store(tmp_path, setup_path, using_infer_string):
     # GH10330
     # No check for non-string path_or-buf, and no test of open store
     df = DataFrame(
@@ -187,6 +185,12 @@ def test_read_hdf_open_store(tmp_path, setup_path):
     df = df.set_index(keys="E", append=True)
 
     path = tmp_path / setup_path
+    if using_infer_string:
+        # TODO(infer_string) make this work for string dtype
+        msg = "Saving a MultiIndex with an extension dtype is not supported."
+        with pytest.raises(NotImplementedError, match=msg):
+            df.to_hdf(path, key="df", mode="w")
+        return
     df.to_hdf(path, key="df", mode="w")
     direct = read_hdf(path, "df")
     with HDFStore(path, mode="r") as store:
diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py
index 875a792467828..409b92d2ddde1 100644
--- a/pandas/tests/io/pytables/test_round_trip.py
+++ b/pandas/tests/io/pytables/test_round_trip.py
@@ -4,8 +4,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas._libs.tslibs import Timestamp
 from pandas.compat import is_platform_windows
 
@@ -26,10 +24,7 @@
 )
 from pandas.util import _test_decorators as td
 
-pytestmark = [
-    pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = [pytest.mark.single_cpu]
 
 
 def test_conv_read_write():
@@ -49,8 +44,8 @@ def roundtrip(key, obj, **kwargs):
 
         o = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         tm.assert_frame_equal(o, roundtrip("frame", o))
 
@@ -150,8 +145,8 @@ def test_api_invalid(tmp_path, setup_path):
     # Invalid.
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )
 
     msg = "Can only append to Tables"
@@ -201,7 +196,7 @@ def test_put_integer(setup_path):
     _check_roundtrip(df, tm.assert_frame_equal, setup_path)
 
 
-def test_table_values_dtypes_roundtrip(setup_path):
+def test_table_values_dtypes_roundtrip(setup_path, using_infer_string):
     with ensure_clean_store(setup_path) as store:
         df1 = DataFrame({"a": [1, 2, 3]}, dtype="f8")
         store.append("df_f8", df1)
@@ -246,6 +241,7 @@ def test_table_values_dtypes_roundtrip(setup_path):
         store.append("df_mixed_dtypes1", df1)
         result = store.select("df_mixed_dtypes1").dtypes.value_counts()
         result.index = [str(i) for i in result.index]
+        str_dtype = "str" if using_infer_string else "object"
         expected = Series(
             {
                 "float32": 2,
@@ -255,7 +251,7 @@ def test_table_values_dtypes_roundtrip(setup_path):
                 "int16": 1,
                 "int8": 1,
                 "int64": 1,
-                "object": 1,
+                str_dtype: 1,
                 "datetime64[s]": 2,
                 "datetime64[ms]": 1,
                 "datetime64[ns]": 1,
@@ -277,10 +273,10 @@ def test_series(setup_path):
     )
     _check_roundtrip(ts, tm.assert_series_equal, path=setup_path)
 
-    ts2 = Series(ts.index, Index(ts.index, dtype=object))
+    ts2 = Series(ts.index, Index(ts.index))
     _check_roundtrip(ts2, tm.assert_series_equal, path=setup_path)
 
-    ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object))
+    ts3 = Series(ts.values, Index(np.asarray(ts.index)))
     _check_roundtrip(
         ts3, tm.assert_series_equal, path=setup_path, check_index_type=False
     )
@@ -370,8 +366,8 @@ def test_timeseries_preepoch(setup_path, request):
 def test_frame(compression, setup_path):
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )
 
     # put in some random NAs
@@ -387,7 +383,7 @@ def test_frame(compression, setup_path):
 
     tdf = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     _check_roundtrip(
@@ -402,7 +398,10 @@ def test_frame(compression, setup_path):
         assert recons._mgr.is_consolidated()
 
     # empty
-    _check_roundtrip(df[:0], tm.assert_frame_equal, path=setup_path)
+    df2 = df[:0]
+    # Prevent df2 from having index with inferred_type as string
+    df2.index = Index([])
+    _check_roundtrip(df2[:0], tm.assert_frame_equal, path=setup_path)
 
 
 def test_empty_series_frame(setup_path):
@@ -434,9 +433,17 @@ def test_can_serialize_dates(setup_path):
     _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path)
 
 
-def test_store_hierarchical(setup_path, multiindex_dataframe_random_data):
+def test_store_hierarchical(
+    setup_path, using_infer_string, multiindex_dataframe_random_data
+):
     frame = multiindex_dataframe_random_data
 
+    if using_infer_string:
+        # TODO(infer_string) make this work for string dtype
+        msg = "Saving a MultiIndex with an extension dtype is not supported."
+        with pytest.raises(NotImplementedError, match=msg):
+            _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path)
+        return
     _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path)
     _check_roundtrip(frame.T, tm.assert_frame_equal, path=setup_path)
     _check_roundtrip(frame["A"], tm.assert_series_equal, path=setup_path)
@@ -455,8 +462,8 @@ def test_store_mixed(compression, setup_path):
     def _make_one():
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         df["obj1"] = "foo"
         df["obj2"] = "bar"
diff --git a/pandas/tests/io/pytables/test_select.py b/pandas/tests/io/pytables/test_select.py
index 4b20b929ef447..28af76f561356 100644
--- a/pandas/tests/io/pytables/test_select.py
+++ b/pandas/tests/io/pytables/test_select.py
@@ -27,10 +27,7 @@
 
 from pandas.io.pytables import Term
 
-pytestmark = [
-    pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = [pytest.mark.single_cpu]
 
 
 def test_select_columns_in_where(setup_path):
@@ -138,7 +135,7 @@ def test_select(setup_path):
         # select with columns=
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         _maybe_remove(store, "df")
@@ -278,8 +275,8 @@ def test_select_dtypes(setup_path, request):
     with ensure_clean_store(setup_path) as store:
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=Index(list("ABCD"), dtype=object),
-            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=Index(list("ABCD")),
+            index=Index([f"i-{i}" for i in range(30)]),
         )
         expected = df[df["A"] > 0]
 
@@ -350,7 +347,7 @@ def test_select_iterator(tmp_path, setup_path):
     with ensure_clean_store(setup_path) as store:
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         _maybe_remove(store, "df")
@@ -375,7 +372,7 @@
 
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         df.to_hdf(path, key="df_non_table")
@@ -391,7 +388,7 @@
 
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         df.to_hdf(path, key="df", format="table")
@@ -408,7 +405,7 @@
    with ensure_clean_store(setup_path) as store:
         df1 = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=10, freq="B"),
         )
         store.append("df1", df1, data_columns=True)
@@ -436,7 +433,7 @@ def test_select_iterator_complete_8014(setup_path):
     with ensure_clean_store(setup_path) as store:
         expected = DataFrame(
             np.random.default_rng(2).standard_normal((100064, 4)),
-            columns=Index(list("ABCD"), dtype=object),
+            columns=Index(list("ABCD")),
             index=date_range("2000-01-01", periods=100064, freq="s"),
         )
         _maybe_remove(store, "df")
@@ -471,7
def test_select_iterator_complete_8014(setup_path): with ensure_clean_store(setup_path) as store: expected = DataFrame( np.random.default_rng(2).standard_normal((100064, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=100064, freq="s"), ) _maybe_remove(store, "df") @@ -513,7 +510,7 @@ def test_select_iterator_non_complete_8014(setup_path): with ensure_clean_store(setup_path) as store: expected = DataFrame( np.random.default_rng(2).standard_normal((100064, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=100064, freq="s"), ) _maybe_remove(store, "df") @@ -547,7 +544,7 @@ def test_select_iterator_non_complete_8014(setup_path): with ensure_clean_store(setup_path) as store: expected = DataFrame( np.random.default_rng(2).standard_normal((100064, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=100064, freq="s"), ) _maybe_remove(store, "df") @@ -571,7 +568,7 @@ def test_select_iterator_many_empty_frames(setup_path): with ensure_clean_store(setup_path) as store: expected = DataFrame( np.random.default_rng(2).standard_normal((100064, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=100064, freq="s"), ) _maybe_remove(store, "df") @@ -623,7 +620,7 @@ def test_select_iterator_many_empty_frames(setup_path): def test_frame_select(setup_path, request): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -655,7 +652,7 @@ def test_frame_select(setup_path, request): # invalid terms df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) store.append("df_time", df) @@ -669,12 +666,13 @@ def test_frame_select(setup_path, request): # store.select('frame', [crit1, crit2]) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_frame_select_complex(setup_path): # select via complex criteria df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df["string"] = "foo" @@ -791,7 +789,7 @@ def test_invalid_filtering(setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -813,7 +811,7 @@ def test_string_select(setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -857,7 +855,7 @@ def test_string_select(setup_path): def test_select_as_multiple(setup_path): df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df2 = df1.copy().rename(columns="{}_2".format) @@ -982,6 +980,7 @@ def test_query_long_float_literal(setup_path): tm.assert_frame_equal(expected, result) 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_query_compare_column_type(setup_path): # GH 15492 df = DataFrame( @@ -1058,7 +1057,6 @@ def test_select_large_integer(tmp_path): ), columns=["x", "y"], ) - result = None with HDFStore(path) as s: s.append("data", df, data_columns=True, index=False) result = s.select("data", where="y==-9223372036854775801").get("y").get(0) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 2bfe9e33a6235..bb2058c050f2a 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -25,6 +25,7 @@ timedelta_range, ) import pandas._testing as tm +from pandas.conftest import has_pyarrow from pandas.tests.io.pytables.common import ( _maybe_remove, ensure_clean_store, @@ -35,10 +36,7 @@ read_hdf, ) -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = [pytest.mark.single_cpu] tables = pytest.importorskip("tables") @@ -110,7 +108,7 @@ def test_iter_empty(setup_path): assert list(store) == [] -def test_repr(setup_path, performance_warning): +def test_repr(setup_path, performance_warning, using_infer_string): with ensure_clean_store(setup_path) as store: repr(store) store.info() @@ -145,7 +143,9 @@ def test_repr(setup_path, performance_warning): df.loc[df.index[3:6], ["obj1"]] = np.nan df = df._consolidate() - with tm.assert_produces_warning(performance_warning): + warning = None if using_infer_string else performance_warning + msg = "cannot\nmap directly to c-types .* dtype='object'" + with tm.assert_produces_warning(warning, match=msg): store["df"] = df # make a random group in hdf space @@ -316,7 +316,7 @@ def test_getattr(setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) store["df"] = df @@ -369,7 +369,7 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path): { "A": [0.0, 1.0, 2.0, 3.0, 4.0], "B": [0.0, 1.0, 0.0, 1.0, 0.0], - "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object), + "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"]), "D": date_range("20130101", periods=5), } ).set_index("C") @@ -385,6 +385,10 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path): tm.assert_series_equal(read_hdf(path, "ss4"), concat([df["B"], df2["B"]])) +@pytest.mark.xfail( + using_string_dtype() and has_pyarrow, + reason="TODO(infer_string): can't encode '\ud800': surrogates not allowed", +) @pytest.mark.parametrize("format", ["fixed", "table"]) def test_to_hdf_errors(tmp_path, format, setup_path): data = ["\ud800foo"] @@ -406,7 +410,7 @@ def col(t, column): # data columns df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df["string"] = "foo" @@ -441,7 +445,7 @@ def col(t, column): # data columns df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df["string"] = "foo" @@ -483,8 +487,8 @@ def test_table_mixed_dtypes(setup_path): # frame df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + 
columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) df["obj1"] = "foo" df["obj2"] = "bar" @@ -539,8 +543,8 @@ def test_remove(setup_path): ) df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) store["a"] = ts store["b"] = df @@ -603,8 +607,8 @@ def test_same_name_scoping(setup_path): def test_store_index_name(setup_path): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) df.index.name = "foo" @@ -650,8 +654,8 @@ def test_store_index_name_numpy_str(tmp_path, table_format, setup_path, unit, tz def test_store_series_name(setup_path): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) series = df["A"] @@ -665,7 +669,7 @@ def test_overwrite_node(setup_path): with ensure_clean_store(setup_path) as store: store["a"] = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) ts = Series( @@ -679,7 +683,7 @@ def test_overwrite_node(setup_path): def test_coordinates(setup_path): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) @@ -714,7 +718,7 @@ def test_coordinates(setup_path): _maybe_remove(store, "df2") df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) df2 = df1.copy().rename(columns="{}_2".format) @@ -870,8 +874,8 @@ def test_start_stop_fixed(setup_path): # sparse; not implemented df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) df.iloc[3:5, 1:3] = np.nan df.iloc[8:10, -2] = np.nan @@ -904,8 +908,8 @@ def test_select_filter_corner(setup_path, request): def test_path_pathlib(): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) result = tm.round_trip_pathlib( @@ -934,8 +938,8 @@ def test_contiguous_mixed_data_table(start, stop, setup_path): def test_path_pathlib_hdfstore(): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) def writer(path): @@ -953,8 +957,8 @@ def reader(path): def test_pickle_path_localpath(): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in 
range(30)]), ) result = tm.round_trip_pathlib( lambda p: df.to_hdf(p, key="df"), lambda p: read_hdf(p, "df") @@ -966,8 +970,8 @@ def test_pickle_path_localpath(): def test_copy(propindexes): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) with tm.ensure_clean() as path: diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index 8f179f844e4d0..9192804e49bd1 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs.timezones import maybe_get_tz import pandas.util._test_decorators as td @@ -25,10 +23,6 @@ ensure_clean_store, ) -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) - def _compare_with_tz(a, b): tm.assert_frame_equal(a, b) From d589aba2fc0227f59a9a99404ba50869ed2e5355 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 10 Feb 2025 23:47:35 +0530 Subject: [PATCH 48/68] DOC: fix ES01 for pandas.DataFrame.set_flags and pandas.Series.set_flags (#60891) --- pandas/core/generic.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f376518d4d3b8..874ab1a3c944d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -406,6 +406,12 @@ def set_flags( """ Return a new object with updated flags. + This method creates a shallow copy of the original object, preserving its + underlying data while modifying its global flags. In particular, it allows + you to update properties such as whether duplicate labels are permitted. This + behavior is especially useful in method chains, where one wishes to + adjust DataFrame or Series characteristics without altering the original object. + Parameters ---------- copy : bool, default False From a65e25d12937e4684452b20f58174bdd0cf77e7f Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 10 Feb 2025 23:48:55 +0530 Subject: [PATCH 49/68] DOC: fix ES01 for pandas.api.types.infer_dtype (#60904) --- pandas/_libs/lib.pyx | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 5239aa2c61dc5..fce51700d623f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1522,6 +1522,11 @@ def infer_dtype(value: object, skipna: bool = True) -> str: """ Return a string label of the type of a scalar or list-like of values. + This method inspects the elements of the provided input and determines + classification of its data type. It is particularly useful for + handling heterogeneous data inputs where explicit dtype conversion may not + be possible or necessary. + Parameters ---------- value : scalar, list, ndarray, or pandas type From ea3ff69042219a08adea3f26f78f9f7a9bd208f9 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 10 Feb 2025 23:49:30 +0530 Subject: [PATCH 50/68] DOC: fix ES01 for pandas.CategoricalDtype (#60902) * DOC: fix ES01 for is a dtype representation for categorical data, which allows users to define a fixed set of values and optionally impose an ordering. This is particularly useful for handling categorical variables efficiently, as it can significantly reduce memory usage compared to using object dtypes. 
* DOC: fix ES01 for pandas.CategoricalDtype --- pandas/core/dtypes/dtypes.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index d8dd6441913b5..6be6787862654 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -161,6 +161,11 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): """ Type for categorical data with the categories and orderedness. + It is a dtype representation for categorical data, which allows users to define + a fixed set of values and optionally impose an ordering. This is particularly + useful for handling categorical variables efficiently, as it can significantly + reduce memory usage compared to using object dtypes. + Parameters ---------- categories : sequence, optional From 859b8730a97b9d5a91e050609b1701bc1ad6f3cb Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 10 Feb 2025 23:50:13 +0530 Subject: [PATCH 51/68] DOC: fix ES01 for pandas.DataFrame.sparse (#60892) --- pandas/core/arrays/sparse/accessor.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 0ed5f69fe4703..eab8527eef526 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -279,6 +279,11 @@ class SparseFrameAccessor(BaseAccessor, PandasDelegate): """ DataFrame accessor for sparse data. + It allows users to interact with a `DataFrame` that contains sparse data types + (`SparseDtype`). It provides methods and attributes to efficiently work with sparse + storage, reducing memory usage while maintaining compatibility with standard pandas + operations. + Parameters ---------- data : scipy.sparse.spmatrix From a9a3047b6091114254250fa1f9120f4f925756cd Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 10 Feb 2025 23:50:57 +0530 Subject: [PATCH 52/68] DOC: fix ES01 for pandas.read_json (#60900) --- pandas/io/json/_json.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 703a2b3656c9c..e032e26d771d7 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -520,6 +520,12 @@ def read_json( """ Convert a JSON string to pandas object. + This method reads JSON files or JSON-like data and converts them into pandas + objects. It supports a variety of input formats, including line-delimited JSON, + compressed files, and various data representations (table, records, index-based, + etc.). When `chunksize` is specified, an iterator is returned instead of loading + the entire data into memory. + Parameters ---------- path_or_buf : a str path, path object or file-like object From 4cccb7353786e7e9850c52ade00d66862c8e2e8a Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 10 Feb 2025 23:51:39 +0530 Subject: [PATCH 53/68] DOC: fix ES01 for pandas.HDFStore.put (#60899) --- pandas/io/pytables.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index abad825a9a0ae..5cedb41fdcb22 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1131,6 +1131,12 @@ def put( """ Store object in HDFStore. + This method writes a pandas DataFrame or Series into an HDF5 file using + either the fixed or table format. The `table` format allows additional + operations like incremental appends and queries but may have performance + trade-offs. The `fixed` format provides faster read/write operations but + does not support appends or queries. 
+ Parameters ---------- key : str From d14f7cf535a665f668f43fafc3fbeca6d28c4d11 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 10 Feb 2025 23:52:12 +0530 Subject: [PATCH 54/68] DOC: fix ES01 for pandas.io.formats.style.Styler (#60893) --- pandas/io/formats/style.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index f2ec41d2c6a43..c9bea58751207 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -117,6 +117,12 @@ class Styler(StylerRenderer): r""" Helps style a DataFrame or Series according to the data with HTML and CSS. + This class provides methods for styling and formatting a Pandas DataFrame or Series. + The styled output can be rendered as HTML or LaTeX, and it supports CSS-based + styling, allowing users to control colors, font styles, and other visual aspects of + tabular data. It is particularly useful for presenting DataFrame objects in a + Jupyter Notebook environment or when exporting styled tables for reports and + Parameters ---------- data : Series or DataFrame From 8f802cd01fd7cf83624d13e45a65c11dd2d221e2 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Mon, 10 Feb 2025 10:23:27 -0800 Subject: [PATCH 55/68] =?UTF-8?q?BUG:=20Fix=20bug=20in=20DataFrame=20binar?= =?UTF-8?q?y=20op=20not=20respecting=20fill=5Fvalue=20in=20case=E2=80=A6?= =?UTF-8?q?=20(#60906)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BUG: Fix bug in DataFrame binary op not respecting fill_value in case of MultiIndex columns --- pandas/core/frame.py | 11 ++++++++--- pandas/tests/frame/test_arithmetic.py | 20 ++++++++++++++++++++ 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 57a7b9467a05e..3199733cfb85f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8029,10 +8029,15 @@ def _should_reindex_frame_op(self, right, op, axis: int, fill_value, level) -> b return False if ( - isinstance(self.columns, MultiIndex) - or isinstance(right.columns, MultiIndex) - ) and not self.columns.equals(right.columns): + ( + isinstance(self.columns, MultiIndex) + or isinstance(right.columns, MultiIndex) + ) + and not self.columns.equals(right.columns) + and fill_value is None + ): # GH#60498 Reindex if MultiIndexe columns are not matching + # GH#60903 Don't reindex if fill_value is provided return True if fill_value is None and level is None and axis == 1: diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index aa2d5e9d23815..8239de3f39c20 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -2058,6 +2058,26 @@ def test_arithmetic_multiindex_column_align(): tm.assert_frame_equal(result, expected) +def test_arithmetic_multiindex_column_align_with_fillvalue(): + # GH#60903 + df1 = DataFrame( + data=[[1.0, 2.0]], + columns=MultiIndex.from_tuples([("A", "one"), ("A", "two")]), + ) + df2 = DataFrame( + data=[[3.0, 4.0]], + columns=MultiIndex.from_tuples([("B", "one"), ("B", "two")]), + ) + expected = DataFrame( + data=[[1.0, 2.0, 3.0, 4.0]], + columns=MultiIndex.from_tuples( + [("A", "one"), ("A", "two"), ("B", "one"), ("B", "two")] + ), + ) + result = df1.add(df2, fill_value=0) + tm.assert_frame_equal(result, expected) + + def test_bool_frame_mult_float(): # GH 18549 df = DataFrame(True, list("ab"), list("cd")) From 11e3dc2738befa5803aea676f1b68082a8b24a8b Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Mon, 10 Feb 2025 
10:24:53 -0800 Subject: [PATCH 56/68] PERF: Fix groupby skipna performance (#60871) --- pandas/_libs/groupby.pyx | 97 +++++++++++++++++--------- pandas/core/_numba/kernels/min_max_.py | 2 +- 2 files changed, 64 insertions(+), 35 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 16a104a46ed3d..f65fa2368967a 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -753,16 +753,20 @@ def group_sum( if uses_mask: isna_entry = mask[i, j] - isna_result = result_mask[lab, j] else: isna_entry = _treat_as_na(val, is_datetimelike) - isna_result = _treat_as_na(sumx[lab, j], is_datetimelike) - if not skipna and isna_result: - # If sum is already NA, don't add to it. This is important for - # datetimelikebecause adding a value to NPY_NAT may not result - # in a NPY_NAT - continue + if not skipna: + if uses_mask: + isna_result = result_mask[lab, j] + else: + isna_result = _treat_as_na(sumx[lab, j], is_datetimelike) + + if isna_result: + # If sum is already NA, don't add to it. This is important for + # datetimelikebecause adding a value to NPY_NAT may not result + # in a NPY_NAT + continue if not isna_entry: nobs[lab, j] += 1 @@ -845,14 +849,18 @@ def group_prod( if uses_mask: isna_entry = mask[i, j] - isna_result = result_mask[lab, j] else: isna_entry = _treat_as_na(val, False) - isna_result = _treat_as_na(prodx[lab, j], False) - if not skipna and isna_result: - # If prod is already NA, no need to update it - continue + if not skipna: + if uses_mask: + isna_result = result_mask[lab, j] + else: + isna_result = _treat_as_na(prodx[lab, j], False) + + if isna_result: + # If prod is already NA, no need to update it + continue if not isna_entry: nobs[lab, j] += 1 @@ -919,22 +927,30 @@ def group_var( if uses_mask: isna_entry = mask[i, j] - isna_result = result_mask[lab, j] elif is_datetimelike: # With group_var, we cannot just use _treat_as_na bc # datetimelike dtypes get cast to float64 instead of # to int64. isna_entry = val == NPY_NAT - isna_result = out[lab, j] == NPY_NAT else: isna_entry = _treat_as_na(val, is_datetimelike) - isna_result = _treat_as_na(out[lab, j], is_datetimelike) - if not skipna and isna_result: - # If aggregate is already NA, don't add to it. This is important for - # datetimelike because adding a value to NPY_NAT may not result - # in a NPY_NAT - continue + if not skipna: + if uses_mask: + isna_result = result_mask[lab, j] + elif is_datetimelike: + # With group_var, we cannot just use _treat_as_na bc + # datetimelike dtypes get cast to float64 instead of + # to int64. + isna_result = out[lab, j] == NPY_NAT + else: + isna_result = _treat_as_na(out[lab, j], is_datetimelike) + + if isna_result: + # If aggregate is already NA, don't add to it. This is + # important for datetimelike because adding a value to NPY_NAT + # may not result in a NPY_NAT + continue if not isna_entry: nobs[lab, j] += 1 @@ -1232,22 +1248,30 @@ def group_mean( if uses_mask: isna_entry = mask[i, j] - isna_result = result_mask[lab, j] elif is_datetimelike: # With group_mean, we cannot just use _treat_as_na bc # datetimelike dtypes get cast to float64 instead of # to int64. isna_entry = val == NPY_NAT - isna_result = sumx[lab, j] == NPY_NAT else: isna_entry = _treat_as_na(val, is_datetimelike) - isna_result = _treat_as_na(sumx[lab, j], is_datetimelike) - if not skipna and isna_result: - # If sum is already NA, don't add to it. 
This is important for - # datetimelike because adding a value to NPY_NAT may not result - # in NPY_NAT - continue + if not skipna: + if uses_mask: + isna_result = result_mask[lab, j] + elif is_datetimelike: + # With group_mean, we cannot just use _treat_as_na bc + # datetimelike dtypes get cast to float64 instead of + # to int64. + isna_result = sumx[lab, j] == NPY_NAT + else: + isna_result = _treat_as_na(sumx[lab, j], is_datetimelike) + + if isna_result: + # If sum is already NA, don't add to it. This is important for + # datetimelike because adding a value to NPY_NAT may not result + # in NPY_NAT + continue if not isna_entry: nobs[lab, j] += 1 @@ -1909,15 +1933,20 @@ cdef group_min_max( if uses_mask: isna_entry = mask[i, j] - isna_result = result_mask[lab, j] else: isna_entry = _treat_as_na(val, is_datetimelike) - isna_result = _treat_as_na(group_min_or_max[lab, j], - is_datetimelike) - if not skipna and isna_result: - # If current min/max is already NA, it will always be NA - continue + if not skipna: + if uses_mask: + isna_result = result_mask[lab, j] + else: + isna_result = _treat_as_na( + group_min_or_max[lab, j], is_datetimelike + ) + + if isna_result: + # If current min/max is already NA, it will always be NA + continue if not isna_entry: nobs[lab, j] += 1 diff --git a/pandas/core/_numba/kernels/min_max_.py b/pandas/core/_numba/kernels/min_max_.py index d56453e4e5abf..68aa1446bbe3c 100644 --- a/pandas/core/_numba/kernels/min_max_.py +++ b/pandas/core/_numba/kernels/min_max_.py @@ -98,7 +98,7 @@ def grouped_min_max( for i in range(N): lab = labels[i] val = values[i] - if lab < 0 or (nobs[lab] >= 1 and np.isnan(output[lab])): + if lab < 0 or (not skipna and nobs[lab] >= 1 and np.isnan(output[lab])): continue if values.dtype.kind == "i" or not np.isnan(val): From 5e5da07104e710137bc73d9c0bf42078260e1b1c Mon Sep 17 00:00:00 2001 From: William Andrea <22385371+wjandrea@users.noreply.github.com> Date: Mon, 10 Feb 2025 14:25:51 -0400 Subject: [PATCH 57/68] BUG: Fix frozenset display in pprint (#60867) * Fix frozenset display in pprint It needs to have braces like a set. 
* Fix frozenset pprint test --- pandas/io/formats/printing.py | 2 +- pandas/tests/io/formats/test_printing.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 5a52ee78cb9be..ab27321ffe83c 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -112,7 +112,7 @@ def _pprint_seq( if isinstance(seq, set): fmt = "{{{body}}}" elif isinstance(seq, frozenset): - fmt = "frozenset({body})" + fmt = "frozenset({{{body}}})" else: fmt = "[{body}]" if hasattr(seq, "__setitem__") else "({body})" diff --git a/pandas/tests/io/formats/test_printing.py b/pandas/tests/io/formats/test_printing.py index f86b4af2647f8..7d154235d2c4a 100644 --- a/pandas/tests/io/formats/test_printing.py +++ b/pandas/tests/io/formats/test_printing.py @@ -83,7 +83,7 @@ def test_repr_mapping(self): assert printing.pprint_thing(MyMapping()) == "{'a': 4, 'b': 4}" def test_repr_frozenset(self): - assert printing.pprint_thing(frozenset([1, 2])) == "frozenset(1, 2)" + assert printing.pprint_thing(frozenset([1, 2])) == "frozenset({1, 2})" class TestFormatBase: From 02de8140251096386cbefab0186d45af0c3d8ebd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 10 Feb 2025 10:26:28 -0800 Subject: [PATCH 58/68] STY: Enable shellcheck pre-commit hook (#60817) --- .pre-commit-config.yaml | 5 +++++ ci/code_checks.sh | 18 +++++++++--------- ci/run_tests.sh | 8 +++----- ci/upload_wheels.sh | 9 +++++---- scripts/cibw_before_build.sh | 1 + scripts/cibw_before_build_windows.sh | 1 + scripts/cibw_before_test_windows.sh | 1 + scripts/download_wheels.sh | 17 +++++++++-------- 8 files changed, 34 insertions(+), 26 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 77bcadf57dd2d..c13c38b20a7f8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -106,6 +106,11 @@ repos: hooks: - id: meson-fmt args: ['--inplace'] +- repo: https://github.com/shellcheck-py/shellcheck-py + rev: v0.10.0.1 + hooks: + - id: shellcheck + args: ["--severity=warning"] - repo: local hooks: - id: pyright diff --git a/ci/code_checks.sh b/ci/code_checks.sh index ee5b7eb4f09fb..5782b2b171e07 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -24,15 +24,15 @@ else fi [[ -z "$CHECK" || "$CHECK" == "code" || "$CHECK" == "doctests" || "$CHECK" == "docstrings" || "$CHECK" == "single-docs" || "$CHECK" == "notebooks" ]] || \ - { echo "Unknown command $1. Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 9999; } + { echo "Unknown command $1. Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 1; } -BASE_DIR="$(dirname $0)/.." +BASE_DIR="$(dirname "$0")/.." RET=0 ### CODE ### if [[ -z "$CHECK" || "$CHECK" == "code" ]]; then - MSG='Check import. No warnings, and blocklist some optional dependencies' ; echo $MSG + MSG='Check import. 
No warnings, and blocklist some optional dependencies' ; echo "$MSG" python -W error -c " import sys import pandas @@ -49,24 +49,24 @@ if mods: sys.stderr.write('err: pandas should not import: {}\n'.format(', '.join(mods))) sys.exit(len(mods)) " - RET=$(($RET + $?)) ; echo $MSG "DONE" + RET=$(($RET + $?)) ; echo "$MSG" "DONE" fi ### DOCTESTS ### if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then - MSG='Python and Cython Doctests' ; echo $MSG + MSG='Python and Cython Doctests' ; echo "$MSG" python -c 'import pandas as pd; pd.test(run_doctests=True)' - RET=$(($RET + $?)) ; echo $MSG "DONE" + RET=$(($RET + $?)) ; echo "$MSG" "DONE" fi ### DOCSTRINGS ### if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then - MSG='Validate Docstrings' ; echo $MSG - $BASE_DIR/scripts/validate_docstrings.py \ + MSG='Validate Docstrings' ; echo "$MSG" + "$BASE_DIR"/scripts/validate_docstrings.py \ --format=actions \ -i ES01 `# For now it is ok if docstrings are missing the extended summary` \ -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ @@ -265,7 +265,7 @@ fi if [[ -z "$CHECK" || "$CHECK" == "notebooks" ]]; then MSG='Notebooks' ; echo $MSG - jupyter nbconvert --execute $(find doc/source -name '*.ipynb') --to notebook + jupyter nbconvert --execute "$(find doc/source -name '*.ipynb')" --to notebook RET=$(($RET + $?)) ; echo $MSG "DONE" fi diff --git a/ci/run_tests.sh b/ci/run_tests.sh index d2c2f58427a23..16292beec612b 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -3,10 +3,8 @@ # Workaround for pytest-xdist (it collects different tests in the workers if PYTHONHASHSEED is not set) # https://github.com/pytest-dev/pytest/issues/920 # https://github.com/pytest-dev/pytest/issues/1075 -export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') - -# May help reproduce flaky CI builds if set in subsequent runs -echo PYTHONHASHSEED=$PYTHONHASHSEED +PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') +export PYTHONHASHSEED COVERAGE="-s --cov=pandas --cov-report=xml --cov-append --cov-config=pyproject.toml" @@ -16,5 +14,5 @@ if [[ "$PATTERN" ]]; then PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\"" fi -echo $PYTEST_CMD +echo "$PYTEST_CMD" sh -c "$PYTEST_CMD" diff --git a/ci/upload_wheels.sh b/ci/upload_wheels.sh index 3c4aa76c02003..c7c7ca00ee466 100644 --- a/ci/upload_wheels.sh +++ b/ci/upload_wheels.sh @@ -1,3 +1,4 @@ +#!/bin/bash # Modified from numpy's https://github.com/numpy/numpy/blob/main/tools/wheels/upload_wheels.sh set_upload_vars() { @@ -19,20 +20,20 @@ set_upload_vars() { fi } upload_wheels() { - echo ${PWD} + echo "${PWD}" if [[ ${ANACONDA_UPLOAD} == true ]]; then - if [ -z ${TOKEN} ]; then + if [ -z "${TOKEN}" ]; then echo no token set, not uploading else # sdists are located under dist folder when built through setup.py if compgen -G "./dist/*.gz"; then echo "Found sdist" - anaconda -q -t ${TOKEN} upload --skip -u ${ANACONDA_ORG} ./dist/*.gz + anaconda -q -t "${TOKEN}" upload --skip -u "${ANACONDA_ORG}" ./dist/*.gz echo "Uploaded sdist" fi if compgen -G "./wheelhouse/*.whl"; then echo "Found wheel" - anaconda -q -t ${TOKEN} upload --skip -u ${ANACONDA_ORG} ./wheelhouse/*.whl + anaconda -q -t "${TOKEN}" upload --skip -u "${ANACONDA_ORG}" ./wheelhouse/*.whl echo "Uploaded wheel" fi echo "PyPI-style index: https://pypi.anaconda.org/$ANACONDA_ORG/simple" diff --git a/scripts/cibw_before_build.sh b/scripts/cibw_before_build.sh index 4cdbf8db0ba89..d326dd3637314 100644 --- 
a/scripts/cibw_before_build.sh +++ b/scripts/cibw_before_build.sh @@ -1,3 +1,4 @@ +#!/bin/bash # Add 3rd party licenses, like numpy does for file in $PACKAGE_DIR/LICENSES/*; do cat $file >> $PACKAGE_DIR/LICENSE diff --git a/scripts/cibw_before_build_windows.sh b/scripts/cibw_before_build_windows.sh index 5153ebd691f3b..f9e1e68d8efba 100644 --- a/scripts/cibw_before_build_windows.sh +++ b/scripts/cibw_before_build_windows.sh @@ -1,3 +1,4 @@ +#!/bin/bash # Add 3rd party licenses, like numpy does for file in $PACKAGE_DIR/LICENSES/*; do cat $file >> $PACKAGE_DIR/LICENSE diff --git a/scripts/cibw_before_test_windows.sh b/scripts/cibw_before_test_windows.sh index dd02bc23dd5a1..8878e3950452f 100644 --- a/scripts/cibw_before_test_windows.sh +++ b/scripts/cibw_before_test_windows.sh @@ -1,3 +1,4 @@ +#!/bin/bash # TODO: Delete when there's a NumPy Windows wheel for the free-threaded build on PyPI. FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")" if [[ $FREE_THREADED_BUILD == "True" ]]; then diff --git a/scripts/download_wheels.sh b/scripts/download_wheels.sh index 84279ac7a04d1..3dcae0cadcdcf 100755 --- a/scripts/download_wheels.sh +++ b/scripts/download_wheels.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # # Download all wheels for a pandas version. # @@ -11,11 +11,12 @@ # one by one to the dist/ directory where they would be generated. VERSION=$1 -mkdir -p $(dirname -- $0)/../dist -DIST_DIR="$(realpath $(dirname -- $0)/../dist)" +BASE_DIR=$(dirname -- $0) +mkdir -p $BASE_DIR/../dist +DIST_DIR="$(realpath $BASE_DIR/../dist)" -if [ -z $VERSION ]; then - printf "Usage:\n\t$0 \n\nWhere is for example 1.5.3" +if [ -z "$VERSION" ]; then + printf "Usage:\n\t%s \n\nWhere is for example 1.5.3" "$0" exit 1 fi @@ -23,7 +24,7 @@ curl "https://anaconda.org/multibuild-wheels-staging/pandas/files?version=${VERS grep "href=\"/multibuild-wheels-staging/pandas/${VERSION}" | \ sed -r 's/.*.*/\1/g' | \ awk '{print "https://anaconda.org" $0 }' | \ - xargs wget -P $DIST_DIR + xargs wget -P "$DIST_DIR" -printf "\nWheels downloaded to $DIST_DIR\nYou can upload them to PyPI using:\n\n" -printf "\ttwine upload ${DIST_DIR}/pandas-${VERSION}*.{whl,tar.gz} --skip-existing" +printf '\nWheels downloaded to %s\nYou can upload them to PyPI using:\n\n' "$DIST_DIR" +printf "\ttwine upload %s/pandas-%s*.{whl,tar.gz} --skip-existing" "$DIST_DIR" "$VERSION" From 05de25381f71657bd425d2c4045d81a46b2d3740 Mon Sep 17 00:00:00 2001 From: Rafael Fontenelle Date: Tue, 11 Feb 2025 14:08:50 -0300 Subject: [PATCH 59/68] Uniformize date format in index.html (#60912) --- web/pandas/index.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/pandas/index.html b/web/pandas/index.html index 98628b856edb6..e8aab9e11144c 100644 --- a/web/pandas/index.html +++ b/web/pandas/index.html @@ -121,7 +121,7 @@

Previous versions

    {% for release in releases[5:] %}
  - {{ release.name }} ({{ release.published.strftime("%Y-%m-%d") }})
    + {{ release.name }} ({{ release.published.strftime("%b %d, %Y") }})
    changelog | docs | code From eb32372f37f378f63b1a7400b0e673ea6cf5a2f9 Mon Sep 17 00:00:00 2001 From: "Christine P. Chai" Date: Wed, 12 Feb 2025 09:58:57 -0800 Subject: [PATCH 60/68] DOC: Update the Numba jit links in window.rst (#60917) --- doc/source/user_guide/window.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/window.rst b/doc/source/user_guide/window.rst index 0581951d5bfad..406d77d5b8caa 100644 --- a/doc/source/user_guide/window.rst +++ b/doc/source/user_guide/window.rst @@ -356,11 +356,11 @@ See :ref:`enhancing performance with Numba ` for general us Numba will be applied in potentially two routines: -#. If ``func`` is a standard Python function, the engine will `JIT `__ the passed function. ``func`` can also be a JITed function in which case the engine will not JIT the function again. +#. If ``func`` is a standard Python function, the engine will `JIT `__ the passed function. ``func`` can also be a JITed function in which case the engine will not JIT the function again. #. The engine will JIT the for loop where the apply function is applied to each window. The ``engine_kwargs`` argument is a dictionary of keyword arguments that will be passed into the -`numba.jit decorator `__. +`numba.jit decorator `__. These keyword arguments will be applied to *both* the passed function (if a standard Python function) and the apply for loop over each window. From e3f544dc4f90e88e441a9f47e657ddc5564085e5 Mon Sep 17 00:00:00 2001 From: Rafael Fontenelle Date: Wed, 12 Feb 2025 14:59:46 -0300 Subject: [PATCH 61/68] Add space after inline code and lower-case pandas in ecosystem.md (#60918) --- web/pandas/community/ecosystem.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md index 876e6e5b298ea..c0bd28d021f34 100644 --- a/web/pandas/community/ecosystem.md +++ b/web/pandas/community/ecosystem.md @@ -205,7 +205,7 @@ standard output formats (HTML, HTML presentation slides, LaTeX, PDF, ReStructuredText, Markdown, Python) through 'Download As' in the web interface and `jupyter convert` in a shell. -Pandas DataFrames implement `_repr_html_`and `_repr_latex` methods which +Pandas DataFrames implement `_repr_html_` and `_repr_latex` methods which are utilized by Jupyter Notebook for displaying (abbreviated) HTML or LaTeX tables. LaTeX output is properly escaped. (Note: HTML tables may or may not be compatible with non-HTML Jupyter output formats.) @@ -688,7 +688,7 @@ units aware. ### [Text Extensions](https://ibm.biz/text-extensions-for-pandas) -Text Extensions for Pandas provides extension types to cover common data structures for representing natural language data, plus library integrations that convert the outputs of popular natural language processing libraries into Pandas DataFrames. +Text Extensions for Pandas provides extension types to cover common data structures for representing natural language data, plus library integrations that convert the outputs of popular natural language processing libraries into pandas DataFrames. ## Accessors From b67668e5f7bf6f2e98e19615b00be22c16bc601a Mon Sep 17 00:00:00 2001 From: Shashwat Agrawal <72117025+ShashwatAgrawal20@users.noreply.github.com> Date: Wed, 12 Feb 2025 23:50:27 +0530 Subject: [PATCH 62/68] fix: incorrect ISO week 53 conversion when only 52 weeks exist (#60896) * fix: incorrect ISO week 53 conversion when only 52 weeks exist * test(invalid_iso_week): ready, set, go! 
* test: removing unwanted errors="raise" as those are the defaults. --- pandas/_libs/tslibs/strptime.pyx | 7 +++++++ pandas/tests/tools/test_to_datetime.py | 24 ++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index ed784b6f5ab22..fb89f1328529d 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -924,6 +924,13 @@ cdef (int, int) _calc_julian_from_V(int iso_year, int iso_week, int iso_weekday) correction = date(iso_year, 1, 4).isoweekday() + 3 ordinal = (iso_week * 7) + iso_weekday - correction + + if iso_week == 53: + now = date.fromordinal(date(iso_year, 1, 1).toordinal() + ordinal - iso_weekday) + jan_4th = date(iso_year+1, 1, 4) + if (jan_4th - now).days < 7: + raise ValueError(f"Week 53 does not exist in ISO year {iso_year}.") + # ordinal may be negative or 0 now, which means the date is in the previous # calendar year if ordinal < 1: diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 566fd8d901569..e039f54960389 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -794,12 +794,36 @@ def test_to_datetime_np_str(self): ["2015-1-1", "%G-%V-%u", datetime(2014, 12, 29, 0, 0)], ["2015-1-4", "%G-%V-%u", datetime(2015, 1, 1, 0, 0)], ["2015-1-7", "%G-%V-%u", datetime(2015, 1, 4, 0, 0)], + ["2024-52-1", "%G-%V-%u", datetime(2024, 12, 23, 0, 0)], + ["2024-52-7", "%G-%V-%u", datetime(2024, 12, 29, 0, 0)], + ["2025-1-1", "%G-%V-%u", datetime(2024, 12, 30, 0, 0)], + ["2020-53-1", "%G-%V-%u", datetime(2020, 12, 28, 0, 0)], ], ) def test_to_datetime_iso_week_year_format(self, s, _format, dt): # See GH#16607 assert to_datetime(s, format=_format) == dt + @pytest.mark.parametrize( + "msg, s, _format", + [ + [ + "Week 53 does not exist in ISO year 2024", + "2024 53 1", + "%G %V %u", + ], + [ + "Week 53 does not exist in ISO year 2023", + "2023 53 1", + "%G %V %u", + ], + ], + ) + def test_invalid_iso_week_53(self, msg, s, _format): + # See GH#60885 + with pytest.raises(ValueError, match=msg): + to_datetime(s, format=_format) + @pytest.mark.parametrize( "msg, s, _format", [ From b601a0c88ce5345c0cfdb8b3c99222c61cea00b7 Mon Sep 17 00:00:00 2001 From: Guilherme Martins Crocetti <24530683+gmcrocetti@users.noreply.github.com> Date: Wed, 12 Feb 2025 17:37:27 -0300 Subject: [PATCH 63/68] refactor: deprecate usage of `cursor.execute` statements in favor of the in class implementation of `execute`. (#60748) * refactor: deprecate usage of `cursor.execute` statements in favor of the in class implementation of `execute`. 
* chore: using cursor from transaction * using base exceptions * chore: update docs * chore: update whatsnew * chore: using 'import_optional_dependency' to import pyarrow and changing import order * docs: add issue number --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/sql.py | 61 +++++++++++++++++++++++----------- 2 files changed, 42 insertions(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 570faa00e97a8..9fa83e6a10813 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -70,6 +70,7 @@ Other enhancements - :meth:`Series.str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) - :py:class:`frozenset` elements in pandas objects are now natively printed (:issue:`60690`) +- Errors occurring during SQL I/O will now throw a generic :class:`.DatabaseError` instead of the raw Exception type from the underlying driver manager library (:issue:`60748`) - Implemented :meth:`Series.str.isascii` and :meth:`Series.str.isascii` (:issue:`59091`) - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`) - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 5652d7fab0c7c..8e75c61e1744d 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -1651,10 +1651,18 @@ def run_transaction(self): def execute(self, sql: str | Select | TextClause, params=None): """Simple passthrough to SQLAlchemy connectable""" + from sqlalchemy.exc import SQLAlchemyError + args = [] if params is None else [params] if isinstance(sql, str): - return self.con.exec_driver_sql(sql, *args) - return self.con.execute(sql, *args) + execute_function = self.con.exec_driver_sql + else: + execute_function = self.con.execute + + try: + return execute_function(sql, *args) + except SQLAlchemyError as exc: + raise DatabaseError(f"Execution failed on sql '{sql}': {exc}") from exc def read_table( self, @@ -2108,6 +2116,8 @@ def run_transaction(self): self.con.commit() def execute(self, sql: str | Select | TextClause, params=None): + from adbc_driver_manager import Error + if not isinstance(sql, str): raise TypeError("Query must be a string unless using sqlalchemy.") args = [] if params is None else [params] @@ -2115,10 +2125,10 @@ def execute(self, sql: str | Select | TextClause, params=None): try: cur.execute(sql, *args) return cur - except Exception as exc: + except Error as exc: try: self.con.rollback() - except Exception as inner_exc: # pragma: no cover + except Error as inner_exc: # pragma: no cover ex = DatabaseError( f"Execution failed on sql: {sql}\n{exc}\nunable to rollback" ) @@ -2207,8 +2217,7 @@ def read_table( else: stmt = f"SELECT {select_list} FROM {table_name}" - with self.con.cursor() as cur: - cur.execute(stmt) + with self.execute(stmt) as cur: pa_table = cur.fetch_arrow_table() df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) @@ -2278,8 +2287,7 @@ def read_query( if chunksize: raise NotImplementedError("'chunksize' is not implemented for ADBC drivers") - with self.con.cursor() as cur: - cur.execute(sql) + with self.execute(sql) as cur: pa_table = cur.fetch_arrow_table() df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) @@ -2335,6 +2343,9 @@ def to_sql( engine : 
{'auto', 'sqlalchemy'}, default 'auto' Raises NotImplementedError if not set to 'auto' """ + pa = import_optional_dependency("pyarrow") + from adbc_driver_manager import Error + if index_label: raise NotImplementedError( "'index_label' is not implemented for ADBC drivers" @@ -2364,22 +2375,25 @@ def to_sql( if if_exists == "fail": raise ValueError(f"Table '{table_name}' already exists.") elif if_exists == "replace": - with self.con.cursor() as cur: - cur.execute(f"DROP TABLE {table_name}") + sql_statement = f"DROP TABLE {table_name}" + self.execute(sql_statement).close() elif if_exists == "append": mode = "append" - import pyarrow as pa - try: tbl = pa.Table.from_pandas(frame, preserve_index=index) except pa.ArrowNotImplementedError as exc: raise ValueError("datatypes not supported") from exc with self.con.cursor() as cur: - total_inserted = cur.adbc_ingest( - table_name=name, data=tbl, mode=mode, db_schema_name=schema - ) + try: + total_inserted = cur.adbc_ingest( + table_name=name, data=tbl, mode=mode, db_schema_name=schema + ) + except Error as exc: + raise DatabaseError( + f"Failed to insert records on table={name} with {mode=}" + ) from exc self.con.commit() return total_inserted @@ -2496,9 +2510,9 @@ def sql_schema(self) -> str: return str(";\n".join(self.table)) def _execute_create(self) -> None: - with self.pd_sql.run_transaction() as conn: + with self.pd_sql.run_transaction() as cur: for stmt in self.table: - conn.execute(stmt) + cur.execute(stmt) def insert_statement(self, *, num_rows: int) -> str: names = list(map(str, self.frame.columns)) @@ -2520,8 +2534,13 @@ def insert_statement(self, *, num_rows: int) -> str: return insert_statement def _execute_insert(self, conn, keys, data_iter) -> int: + from sqlite3 import Error + data_list = list(data_iter) - conn.executemany(self.insert_statement(num_rows=1), data_list) + try: + conn.executemany(self.insert_statement(num_rows=1), data_list) + except Error as exc: + raise DatabaseError("Execution failed") from exc return conn.rowcount def _execute_insert_multi(self, conn, keys, data_iter) -> int: @@ -2643,6 +2662,8 @@ def run_transaction(self): cur.close() def execute(self, sql: str | Select | TextClause, params=None): + from sqlite3 import Error + if not isinstance(sql, str): raise TypeError("Query must be a string unless using sqlalchemy.") args = [] if params is None else [params] @@ -2650,10 +2671,10 @@ def execute(self, sql: str | Select | TextClause, params=None): try: cur.execute(sql, *args) return cur - except Exception as exc: + except Error as exc: try: self.con.rollback() - except Exception as inner_exc: # pragma: no cover + except Error as inner_exc: # pragma: no cover ex = DatabaseError( f"Execution failed on sql: {sql}\n{exc}\nunable to rollback" ) From 03056560682bccd2a9e1ea1aa4c8d4914bd0fe89 Mon Sep 17 00:00:00 2001 From: Shashwat Agrawal <72117025+ShashwatAgrawal20@users.noreply.github.com> Date: Thu, 13 Feb 2025 23:11:26 +0530 Subject: [PATCH 64/68] docs: clarify `None` case behavior for `sheet_name` in `pandas.read_excel` (#60924) ready, set, go! --- pandas/io/excel/_base.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index ced2ad91dba1e..460af65a60bf6 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -115,7 +115,7 @@ Strings are used for sheet names. Integers are used in zero-indexed sheet positions (chart sheets do not count as a sheet position). Lists of strings/integers are used to request multiple sheets. 
- Specify ``None`` to get all worksheets. + When ``None``, will return a dictionary containing DataFrames for each sheet. Available cases: @@ -124,7 +124,7 @@ * ``"Sheet1"``: Load sheet with name "Sheet1" * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5" as a dict of `DataFrame` - * ``None``: All worksheets. + * ``None``: Returns a dictionary containing DataFrames for each sheet.. header : int, list of int, default 0 Row (0-indexed) to use for the column labels of the parsed @@ -1649,7 +1649,8 @@ def parse( Strings are used for sheet names. Integers are used in zero-indexed sheet positions (chart sheets do not count as a sheet position). Lists of strings/integers are used to request multiple sheets. - Specify ``None`` to get all worksheets. + When ``None``, will return a dictionary containing DataFrames for + each sheet. header : int, list of int, default 0 Row (0-indexed) to use for the column labels of the parsed DataFrame. If a list of integers is passed those row positions will From cb33796ee29625a5bb7a8f0449f821591fa2c8ff Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Thu, 13 Feb 2025 22:28:31 +0000 Subject: [PATCH 65/68] DOC: add series.info to api reference (#60926) * docs: add series.info to api reference * fixup --- doc/source/reference/series.rst | 1 + doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/formats/info.py | 7 ++++++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 43d7480899dc4..6006acc8f5e16 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -25,6 +25,7 @@ Attributes Series.array Series.values Series.dtype + Series.info Series.shape Series.nbytes Series.ndim diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 9fa83e6a10813..4d9a45abe17cd 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -32,6 +32,7 @@ Other enhancements - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`) - :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface `_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`) - Added :meth:`.Styler.to_typst` to write Styler objects to file, buffer or string in Typst format (:issue:`57617`) +- Added missing :meth:`pandas.Series.info` to API reference (:issue:`60926`) - :class:`pandas.api.typing.NoDefault` is available for typing ``no_default`` - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) - :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index b4c6ff8792d52..c9a6e94a0c7c1 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -226,12 +226,17 @@ Series.describe: Generate descriptive statistics of Series. 
From cb33796ee29625a5bb7a8f0449f821591fa2c8ff Mon Sep 17 00:00:00 2001
From: Marco Edward Gorelli
Date: Thu, 13 Feb 2025 22:28:31 +0000
Subject: [PATCH 65/68] DOC: add series.info to api reference (#60926)

* docs: add series.info to api reference

* fixup
---
 doc/source/reference/series.rst | 1 +
 doc/source/whatsnew/v3.0.0.rst  | 1 +
 pandas/io/formats/info.py       | 7 ++++++-
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst
index 43d7480899dc4..6006acc8f5e16 100644
--- a/doc/source/reference/series.rst
+++ b/doc/source/reference/series.rst
@@ -25,6 +25,7 @@ Attributes
    Series.array
    Series.values
    Series.dtype
+   Series.info
    Series.shape
    Series.nbytes
    Series.ndim
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 9fa83e6a10813..4d9a45abe17cd 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -32,6 +32,7 @@ Other enhancements
 - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`)
 - :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface `_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`)
 - Added :meth:`.Styler.to_typst` to write Styler objects to file, buffer or string in Typst format (:issue:`57617`)
+- Added missing :meth:`pandas.Series.info` to API reference (:issue:`60926`)
 - :class:`pandas.api.typing.NoDefault` is available for typing ``no_default``
 - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
 - :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`)
diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py
index b4c6ff8792d52..c9a6e94a0c7c1 100644
--- a/pandas/io/formats/info.py
+++ b/pandas/io/formats/info.py
@@ -226,12 +226,17 @@ Series.describe: Generate descriptive statistics of Series.
     Series.memory_usage: Memory usage of Series."""
 )
+series_max_cols_sub = dedent(
+    """\
+    max_cols : int, optional
+        Unused, exists only for compatibility with DataFrame.info."""
+)
 
 series_sub_kwargs = {
     "klass": "Series",
     "type_sub": "",
-    "max_cols_sub": "",
+    "max_cols_sub": series_max_cols_sub,
     "show_counts_sub": show_counts_sub,
     "examples_sub": series_examples_sub,
     "see_also_sub": series_see_also_sub,

From c73b380501220a692523d50cf9e74a9998a50c5a Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 13 Feb 2025 17:10:56 -0800
Subject: [PATCH 66/68] TST: Update numpy version check for
 test_pandas_dtype_numpy_warning (#60929)

* TST: Update numpy version check for test_pandas_dtype_numpy_warning

* less than the dev version
---
 pandas/core/dtypes/common.py       | 2 +-
 pandas/tests/dtypes/test_common.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index e8881ff014a0c..e92f2363b69f1 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -1836,7 +1836,7 @@ def pandas_dtype(dtype) -> DtypeObj:
     # raise a consistent TypeError if failed
     try:
         with warnings.catch_warnings():
-            # TODO: warnings.catch_warnings can be removed when numpy>2.2.2
+            # TODO: warnings.catch_warnings can be removed when numpy>2.3.0
             # is the minimum version
             # GH#51523 - Series.astype(np.integer) doesn't show
             # numpy deprecation warning of np.integer
diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
index 2bda2fddec2ff..d30fa9fc2ea0f 100644
--- a/pandas/tests/dtypes/test_common.py
+++ b/pandas/tests/dtypes/test_common.py
@@ -789,7 +789,7 @@ def test_validate_allhashable():
 
 def test_pandas_dtype_numpy_warning():
     # GH#51523
-    if Version(np.__version__) <= Version("2.2.2"):
+    if Version(np.__version__) < Version("2.3.0.dev0"):
         ctx = tm.assert_produces_warning(
             DeprecationWarning,
             check_stacklevel=False,
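
The commit's "less than the dev version" note is the key detail here: under PEP 440 ordering, every 2.2.x release — including patch releases after 2.2.2 — sorts below ``2.3.0.dev0``, while 2.3.0 development builds do not. The strict ``<`` therefore keeps all 2.2.x releases in the warning branch and routes numpy dev builds to the other one, which ``<= Version("2.2.2")`` did not. A small sketch using ``packaging`` (pandas vendors equivalent logic under ``pandas.util.version``; the specific version strings below are illustrative):

```python
from packaging.version import Version

# Every 2.2.x release sorts below the 2.3.0 dev line...
assert Version("2.2.2") < Version("2.3.0.dev0")
assert Version("2.2.99") < Version("2.3.0.dev0")

# ...but 2.3.0 development builds (e.g. with a local version label,
# as numpy nightlies carry) and the final release itself do not.
assert not Version("2.3.0.dev0+git20250213") < Version("2.3.0.dev0")
assert not Version("2.3.0") < Version("2.3.0.dev0")
```
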
From 19ea997815d4dadf490d7052a0a3c289be898588 Mon Sep 17 00:00:00 2001
From: "Christine P. Chai"
Date: Thu, 13 Feb 2025 17:55:05 -0800
Subject: [PATCH 67/68] DOC: Update two more links in pandas Ecosystem (#60931)

---
 web/pandas/community/ecosystem.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md
index c0bd28d021f34..2ad8d6243db55 100644
--- a/web/pandas/community/ecosystem.md
+++ b/web/pandas/community/ecosystem.md
@@ -158,7 +158,7 @@ df = pd.read_csv("data.csv")
 df  # discover interesting insights!
 ```
 
-By printing out a dataframe, Lux automatically [recommends a set of visualizations](https://raw.githubusercontent.com/lux-org/lux-resources/master/readme_img/demohighlight.gif) that highlights interesting trends and patterns in the dataframe. Users can leverage any existing pandas commands without modifying their code, while being able to visualize their pandas data structures (e.g., DataFrame, Series, Index) at the same time. Lux also offers a [powerful, intuitive language](https://lux-api.readthedocs.io/en/latest/source/guide/vis.html>) that allow users to create Altair, matplotlib, or Vega-Lite visualizations without having to think at the level of code.
+By printing out a dataframe, Lux automatically [recommends a set of visualizations](https://raw.githubusercontent.com/lux-org/lux-resources/master/readme_img/demohighlight.gif) that highlights interesting trends and patterns in the dataframe. Users can leverage any existing pandas commands without modifying their code, while being able to visualize their pandas data structures (e.g., DataFrame, Series, Index) at the same time. Lux also offers a [powerful, intuitive language](https://lux-api.readthedocs.io/en/latest/source/guide/vis.html) that allows users to create Altair, matplotlib, or Vega-Lite visualizations without having to think at the level of code.
 
 ### [D-Tale](https://github.com/man-group/dtale)
 
@@ -342,7 +342,7 @@ It supports the following data types:
 
 - pandas data types
 - data types defined in the [NTV format](https://loco-philippe.github.io/ES/JSON%20semantic%20format%20(JSON-NTV).htm)
-- data types defined in [Table Schema specification](http://dataprotocols.org/json-table-schema/#field-types-and-formats)
+- data types defined in [Table Schema specification](https://datapackage.org/standard/table-schema/)
 
 The interface is always reversible (conversion round trip) with two formats (JSON-NTV and JSON-TableSchema).

From 6bcd30397d67c3887288c7a82c2c235ce8bc3c7f Mon Sep 17 00:00:00 2001
From: "Christine P. Chai"
Date: Fri, 14 Feb 2025 10:19:49 -0800
Subject: [PATCH 68/68] DOC: Correct typos in developer guide for consistency
 (#60932)

* DOC: Correct a typo in contributing_codebase.rst

* DOC: Correct another typo in developer.rst
---
 doc/source/development/contributing_codebase.rst | 2 +-
 doc/source/development/developer.rst             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst
index c1cfb0d7a623b..143aebd8f236a 100644
--- a/doc/source/development/contributing_codebase.rst
+++ b/doc/source/development/contributing_codebase.rst
@@ -344,7 +344,7 @@ be located.
       - tests.scalar
       - tests.tseries.offsets
 
-2. Does your test depend only on code in pd._libs?
+2. Does your test depend only on code in ``pd._libs``?
    This test likely belongs in one of:
 
    - tests.libs
diff --git a/doc/source/development/developer.rst b/doc/source/development/developer.rst
index 6de237b70f08d..c5c4b7c449ce7 100644
--- a/doc/source/development/developer.rst
+++ b/doc/source/development/developer.rst
@@ -99,7 +99,7 @@ Column metadata
 * Boolean: ``'bool'``
 * Integers: ``'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64'``
 * Floats: ``'float16', 'float32', 'float64'``
-* Date and Time Types: ``'datetime', 'datetimetz'``, ``'timedelta'``
+* Date and Time Types: ``'datetime', 'datetimetz', 'timedelta'``
 * String: ``'unicode', 'bytes'``
 * Categorical: ``'categorical'``
 * Other Python objects: ``'object'``
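
For context on the corrected ``developer.rst`` list: those strings are the ``pandas_type`` values that pandas records in the ``'pandas'`` metadata block of a parquet file's schema. A hedged sketch of how to inspect that block, assuming pyarrow is installed — none of these calls appear in the patch itself:

```python
import io
import json

import pandas as pd
import pyarrow.parquet as pq

buf = io.BytesIO()
pd.DataFrame(
    {
        "ts": pd.date_range("2025-01-01", periods=2),
        "flag": [True, False],
    }
).to_parquet(buf)
buf.seek(0)

# The parquet schema metadata stores a JSON document under the b"pandas" key;
# each column entry carries one of the pandas_type strings listed above.
meta = json.loads(pq.read_schema(buf).metadata[b"pandas"])
for col in meta["columns"]:
    print(col["name"], "->", col["pandas_type"])  # ts -> datetime, flag -> bool
```
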