Merge remote-tracking branch 'upstream/main' into sty/ruff

pandas-dev · Jan 30, 2024 · b40d961 · b40d961
2 parents 23e5d93 + 9b95e45
commit b40d961
Show file tree

Hide file tree

Showing 69 changed files with 600 additions and 374 deletions.
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
@@ -139,7 +139,7 @@ jobs:
 
       - name: Build normal wheels
         if: ${{ (env.IS_SCHEDULE_DISPATCH != 'true' || env.IS_PUSH == 'true') }}
-        uses: pypa/cibuildwheel@v2.16.2
+        uses: pypa/cibuildwheel@v2.16.4
         with:
          package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }}
         env:
@@ -148,7 +148,7 @@ jobs:
 
       - name: Build nightly wheels (with NumPy pre-release)
         if: ${{ (env.IS_SCHEDULE_DISPATCH == 'true' && env.IS_PUSH != 'true') }}
-        uses: pypa/cibuildwheel@v2.16.2
+        uses: pypa/cibuildwheel@v2.16.4
         with:
          package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }}
         env:

diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
@@ -408,6 +408,9 @@ def time_read_stringcsv(self, engine):
     def time_read_bytescsv(self, engine):
         read_csv(self.data(self.BytesIO_input), engine=engine)
 
+    def peakmem_read_csv(self, engine):
+        read_csv(self.data(self.BytesIO_input), engine=engine)
+
 
 class ReadCSVCategorical(BaseIO):
     fname = "__test__.csv"

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -95,7 +95,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         pandas.PeriodIndex.strftime\
         pandas.Series.clip\
         pandas.Series.rename_axis\
-        pandas.Series.interpolate\
         pandas.Series.dt.to_period\
         pandas.Series.dt.tz_localize\
         pandas.Series.dt.tz_convert\
@@ -162,9 +161,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         pandas.tseries.offsets.Milli\
         pandas.tseries.offsets.Micro\
         pandas.tseries.offsets.Nano\
-        pandas.describe_option\
-        pandas.reset_option\
-        pandas.get_option\
         pandas.set_option\
         pandas.Timestamp.max\
         pandas.Timestamp.min\
@@ -186,8 +182,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         pandas.core.groupby.DataFrameGroupBy.hist\
         pandas.core.groupby.DataFrameGroupBy.plot\
         pandas.core.groupby.SeriesGroupBy.plot\
-        pandas.read_hdf\
-        pandas.HDFStore.append\
         pandas.core.window.rolling.Rolling.quantile\
         pandas.core.window.expanding.Expanding.quantile\
         pandas.api.extensions.ExtensionArray.argsort # There should be no backslash in the final line, please keep this comment in the last ignored function

diff --git a/doc/source/conf.py b/doc/source/conf.py
@@ -460,7 +460,6 @@
         "dateutil": ("https://dateutil.readthedocs.io/en/latest/", None),
         "matplotlib": ("https://matplotlib.org/stable/", None),
         "numpy": ("https://numpy.org/doc/stable/", None),
-        "py": ("https://pylib.readthedocs.io/en/latest/", None),
         "python": ("https://docs.python.org/3/", None),
         "scipy": ("https://docs.scipy.org/doc/scipy/", None),
         "pyarrow": ("https://arrow.apache.org/docs/", None),

diff --git a/doc/source/development/community.rst b/doc/source/development/community.rst
@@ -112,7 +112,7 @@ people who are hesitant to bring up their questions or ideas on a large public
 mailing list or GitHub.
 
 If this sounds like the right place for you, you are welcome to join using
-`this link <https://join.slack.com/t/pandas-dev-community/shared_invite/zt-1e2qgy1r6-PLCN8UOLEUAYoLdAsaJilw>`_!
+`this link <https://join.slack.com/t/pandas-dev-community/shared_invite/zt-2blg6u9k3-K6_XvMRDZWeH7Id274UeIg>`_!
 Please remember to follow our `Code of Conduct <https://pandas.pydata.org/community/coc.html>`_,
 and be aware that our admins are monitoring for irrelevant messages and will remove folks who use
 our

diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
@@ -277,11 +277,12 @@ Installable with ``pip install "pandas[excel]"``.
 ========================= ================== =============== =============================================================
 Dependency                Minimum Version    pip extra       Notes
 ========================= ================== =============== =============================================================
-xlrd                      2.0.1              excel           Reading Excel
-xlsxwriter                3.0.5              excel           Writing Excel
-openpyxl                  3.1.0              excel           Reading / writing for xlsx files
+xlrd                      2.0.1              excel           Reading for xls files
+xlsxwriter                3.0.5              excel           Writing for xlsx files
+openpyxl                  3.1.0              excel           Reading / writing for Excel 2010 xlsx/xlsm/xltx/xltm files
 pyxlsb                    1.0.10             excel           Reading for xlsb files
-python-calamine           0.1.7              excel           Reading for xls/xlsx/xlsb/ods files
+python-calamine           0.1.7              excel           Reading for xls/xlsx/xlsm/xlsb/xla/xlam/ods files
+odfpy                     1.4.1              excel           Reading / writing for OpenDocument 1.2 files
 ========================= ================== =============== =============================================================
 
 HTML

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -61,8 +61,8 @@ Basic
 +++++
 
 filepath_or_buffer : various
-  Either a path to a file (a :class:`python:str`, :class:`python:pathlib.Path`,
-  or :class:`py:py._path.local.LocalPath`), URL (including http, ftp, and S3
+  Either a path to a file (a :class:`python:str`, :class:`python:pathlib.Path`)
+  URL (including http, ftp, and S3
   locations), or any object with a ``read()`` method (such as an open file or
   :class:`~python:io.StringIO`).
 sep : str, defaults to ``','`` for :func:`read_csv`, ``\t`` for :func:`read_table`

diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst
@@ -30,7 +30,7 @@ use :class:`api.types.NaTType`.
    pd.Series(["2020", "2020"], dtype=pd.PeriodDtype("D")).reindex([0, 1, 2])
 
 :class:`NA` for :class:`StringDtype`, :class:`Int64Dtype` (and other bit widths),
-:class:`Float64Dtype`(and other bit widths), :class:`BooleanDtype` and :class:`ArrowDtype`.
+:class:`Float64Dtype` (and other bit widths), :class:`BooleanDtype` and :class:`ArrowDtype`.
 These types will maintain the original data type of the data.
 For typing applications, use :class:`api.types.NAType`.
 

diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst
@@ -13,23 +13,30 @@ including other versions of pandas.
 
 Fixed regressions
 ~~~~~~~~~~~~~~~~~
+- Fixed memory leak in :func:`read_csv` (:issue:`57039`)
 - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`)
 - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`)
+- Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`)
+- Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`)
+- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`)
+- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`)
+- Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`)
 - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_221.bug_fixes:
 
 Bug fixes
 ~~~~~~~~~
--
+- Fixed bug in :meth:`DataFrame.__getitem__` for empty :class:`DataFrame` with Copy-on-Write enabled (:issue:`57130`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_221.other:
 
 Other
 ~~~~~
--
+- Added the argument ``skipna`` to :meth:`DataFrameGroupBy.first`, :meth:`DataFrameGroupBy.last`, :meth:`SeriesGroupBy.first`, and :meth:`SeriesGroupBy.last`; achieving ``skipna=False`` used to be available via :meth:`DataFrameGroupBy.nth`, but the behavior was changed in pandas 2.0.0 (:issue:`57019`)
+- Added the argument ``skipna`` to :meth:`Resampler.first`, :meth:`Resampler.last` (:issue:`57019`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_221.contributors:

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -84,7 +84,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor
 
 Other API changes
 ^^^^^^^^^^^^^^^^^
--
+- 3rd party ``py.path`` objects are no longer explicitly supported in IO methods. Use :py:class:`pathlib.Path` objects instead (:issue:`57091`)
 -
 
 .. ---------------------------------------------------------------------------
@@ -142,7 +142,7 @@ Timezones
 
 Numeric
 ^^^^^^^
--
+- Bug in ``np.matmul`` with :class:`Index` inputs raising a ``TypeError`` (:issue:`57079`)
 -
 
 Conversion
@@ -152,7 +152,7 @@ Conversion
 
 Strings
 ^^^^^^^
--
+- Bug in :meth:`Series.value_counts` would not respect ``sort=False`` for series having ``string`` dtype (:issue:`55224`)
 -
 
 Interval

diff --git a/pandas/_config/config.py b/pandas/_config/config.py
@@ -54,14 +54,17 @@
     ContextDecorator,
     contextmanager,
 )
+from inspect import signature
 import re
 from typing import (
     TYPE_CHECKING,
     Any,
     Callable,
     Generic,
+    Literal,
     NamedTuple,
     cast,
+    overload,
 )
 import warnings
 
@@ -270,6 +273,7 @@ class CallableDynamicDoc(Generic[T]):
     def __init__(self, func: Callable[..., T], doc_tmpl: str) -> None:
         self.__doc_tmpl__ = doc_tmpl
         self.__func__ = func
+        self.__signature__ = signature(func)
 
     def __call__(self, *args, **kwds) -> T:
         return self.__func__(*args, **kwds)
@@ -740,7 +744,23 @@ def _build_option_description(k: str) -> str:
     return s
 
 
-def pp_options_list(keys: Iterable[str], width: int = 80, _print: bool = False):
+@overload
+def pp_options_list(
+    keys: Iterable[str], *, width: int = ..., _print: Literal[False] = ...
+) -> str:
+    ...
+
+
+@overload
+def pp_options_list(
+    keys: Iterable[str], *, width: int = ..., _print: Literal[True]
+) -> None:
+    ...
+
+
+def pp_options_list(
+    keys: Iterable[str], *, width: int = 80, _print: bool = False
+) -> str | None:
     """Builds a concise listing of available options, grouped by prefix"""
     from itertools import groupby
     from textwrap import wrap
@@ -770,8 +790,7 @@ def pp(name: str, ks: Iterable[str]) -> list[str]:
     s = "\n".join(ls)
     if _print:
         print(s)
-    else:
-        return s
+    return s
 
 
 #

diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi
@@ -14,9 +14,9 @@ class NDArrayBacked:
     _ndarray: np.ndarray
     def __init__(self, values: np.ndarray, dtype: DtypeObj) -> None: ...
     @classmethod
-    def _simple_new(cls, values: np.ndarray, dtype: DtypeObj): ...
-    def _from_backing_data(self, values: np.ndarray): ...
-    def __setstate__(self, state): ...
+    def _simple_new(cls, values: np.ndarray, dtype: DtypeObj) -> Self: ...
+    def _from_backing_data(self, values: np.ndarray) -> Self: ...
+    def __setstate__(self, state) -> None: ...
     def __len__(self) -> int: ...
     @property
     def shape(self) -> Shape: ...
@@ -26,14 +26,14 @@ class NDArrayBacked:
     def size(self) -> int: ...
     @property
     def nbytes(self) -> int: ...
-    def copy(self, order=...): ...
-    def delete(self, loc, axis=...): ...
-    def swapaxes(self, axis1, axis2): ...
-    def repeat(self, repeats: int | Sequence[int], axis: int | None = ...): ...
-    def reshape(self, *args, **kwargs): ...
-    def ravel(self, order=...): ...
+    def copy(self, order=...) -> Self: ...
+    def delete(self, loc, axis=...) -> Self: ...
+    def swapaxes(self, axis1, axis2) -> Self: ...
+    def repeat(self, repeats: int | Sequence[int], axis: int | None = ...) -> Self: ...
+    def reshape(self, *args, **kwargs) -> Self: ...
+    def ravel(self, order=...) -> Self: ...
     @property
-    def T(self): ...
+    def T(self) -> Self: ...
     @classmethod
     def _concat_same_type(
         cls, to_concat: Sequence[Self], axis: AxisInt = ...

diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi
@@ -136,6 +136,7 @@ def group_last(
     result_mask: npt.NDArray[np.bool_] | None = ...,
     min_count: int = ...,  # Py_ssize_t
     is_datetimelike: bool = ...,
+    skipna: bool = ...,
 ) -> None: ...
 def group_nth(
     out: np.ndarray,  # rank_t[:, ::1]
@@ -147,6 +148,7 @@ def group_nth(
     min_count: int = ...,  # int64_t
     rank: int = ...,  # int64_t
     is_datetimelike: bool = ...,
+    skipna: bool = ...,
 ) -> None: ...
 def group_rank(
     out: np.ndarray,  # float64_t[:, ::1]