Merge branch 'main' of https://github.com/Anurag-Varma/pandas into bug#60723
Anurag-Varma committed Feb 22, 2025
2 parents f98a814 + 48b1571 commit c73a931
Showing 37 changed files with 340 additions and 168 deletions.
3 changes: 0 additions & 3 deletions .github/CODEOWNERS
@@ -4,9 +4,6 @@
 # ci
 ci/ @mroeschke
 
-# web
-web/ @datapythonista
-
 # docs
 doc/cheatsheet @Dr-Irv
 doc/source/development @noatamir
2 changes: 1 addition & 1 deletion doc/source/development/contributing_gitpod.rst
@@ -109,7 +109,7 @@ development experience:
 
 * `VSCode rst extension <https://marketplace.visualstudio.com/items?itemName=lextudio.restructuredtext>`_
 * `Markdown All in One <https://marketplace.visualstudio.com/items?itemName=yzhang.markdown-all-in-one>`_
-* `VSCode Gitlens extension <https://marketplace.visualstudio.com/items?itemName=eamodio.gitlens>`_
+* `VSCode GitLens extension <https://marketplace.visualstudio.com/items?itemName=eamodio.gitlens>`_
 * `VSCode Git Graph extension <https://marketplace.visualstudio.com/items?itemName=mhutchie.git-graph>`_
 
 Development workflow with Gitpod
2 changes: 1 addition & 1 deletion doc/source/user_guide/merging.rst
@@ -906,7 +906,7 @@ resetting indexes.
 Joining multiple :class:`DataFrame`
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-A list or tuple of ``:class:`DataFrame``` can also be passed to :meth:`~DataFrame.join`
+A list or tuple of :class:`DataFrame` can also be passed to :meth:`~DataFrame.join`
 to join them together on their indexes.
 
 .. ipython:: python
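For context, the corrected sentence documents the list form of `DataFrame.join`; a minimal sketch of that usage (hypothetical frames and index labels):

```python
import pandas as pd

left = pd.DataFrame({"A": ["A0", "A1"]}, index=["K0", "K1"])
right = pd.DataFrame({"B": ["B0", "B1"]}, index=["K0", "K1"])
other = pd.DataFrame({"C": ["C0", "C1"]}, index=["K0", "K1"])

# Passing a list joins all frames on their shared index in one call.
result = left.join([right, other])
print(result)
```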
3 changes: 2 additions & 1 deletion doc/source/user_guide/window.rst
@@ -70,7 +70,8 @@ which will first group the data by the specified keys and then perform a windowing
 
 Some windowing aggregation, ``mean``, ``sum``, ``var`` and ``std`` methods may suffer from numerical
 imprecision due to the underlying windowing algorithms accumulating sums. When values differ
-with magnitude :math:`1/np.finfo(np.double).eps` this results in truncation. It must be
+with magnitude ``1/np.finfo(np.double).eps`` (approximately :math:`4.5 \times 10^{15}`),
+this results in truncation. It must be
 noted, that large values may have an impact on windows, which do not include these values. `Kahan summation
 <https://en.wikipedia.org/wiki/Kahan_summation_algorithm>`__ is used
 to compute the rolling sums to preserve accuracy as much as possible.
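A quick check of the threshold the rewritten sentence now spells out (standard NumPy, no assumptions beyond float64):

```python
import numpy as np

# 1 / machine epsilon for float64: values whose magnitudes differ by
# roughly this factor can no longer be summed without truncation.
threshold = 1 / np.finfo(np.double).eps
print(f"{threshold:.3e}")  # ~4.504e+15, the figure added to the docs
```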
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v2.3.0.rst
@@ -37,7 +37,8 @@ Other enhancements
 updated to work correctly with NumPy >= 2 (:issue:`57739`)
 - :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`)
 - :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`)
-- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`)
+- The :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`60940`)
+- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns (:issue:`60633`)
 - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
 
 .. ---------------------------------------------------------------------------
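A sketch of the behavior the amended entry describes (assuming a pandas build where these ``StringDtype`` accumulations are available):

```python
import pandas as pd

s = pd.Series(["b", "a", "c"], dtype="string")
print(s.cumsum().tolist())   # ['b', 'ba', 'bac']: running concatenation
print(s.cummin().tolist())   # ['b', 'a', 'a']:    lexicographic running min
print(s.cummax().tolist())   # ['b', 'b', 'c']:    lexicographic running max
```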
4 changes: 4 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -71,6 +71,7 @@ Other enhancements
 - :meth:`Series.str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
 - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
 - :py:class:`frozenset` elements in pandas objects are now natively printed (:issue:`60690`)
+- Add ``"delete_rows"`` option to ``if_exists`` argument in :meth:`DataFrame.to_sql` deleting all records of the table before inserting data (:issue:`37210`).
 - Errors occurring during SQL I/O will now throw a generic :class:`.DatabaseError` instead of the raw Exception type from the underlying driver manager library (:issue:`60748`)
 - Implemented :meth:`Series.str.isascii` and :meth:`Series.str.isascii` (:issue:`59091`)
 - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`)
@@ -357,6 +358,7 @@ Other API changes
 - Made ``dtype`` a required argument in :meth:`ExtensionArray._from_sequence_of_strings` (:issue:`56519`)
 - Passing a :class:`Series` input to :func:`json_normalize` will now retain the :class:`Series` :class:`Index`, previously output had a new :class:`RangeIndex` (:issue:`51452`)
 - Removed :meth:`Index.sort` which always raised a ``TypeError``. This attribute is not defined and will raise an ``AttributeError`` (:issue:`59283`)
+- Unused ``dtype`` argument has been removed from the :class:`MultiIndex` constructor (:issue:`60962`)
 - Updated :meth:`DataFrame.to_excel` so that the output spreadsheet has no styling. Custom styling can still be done using :meth:`Styler.to_excel` (:issue:`54154`)
 - pickle and HDF (``.h5``) files created with Python 2 are no longer explicitly supported (:issue:`57387`)
 - pickled objects from pandas version less than ``1.0.0`` are no longer supported (:issue:`57155`)
@@ -787,6 +789,7 @@ Sparse
 
 ExtensionArray
 ^^^^^^^^^^^^^^
+- Bug in :class:`Categorical` when constructing with an :class:`Index` with :class:`ArrowDtype` (:issue:`60563`)
 - Bug in :meth:`.arrays.ArrowExtensionArray.__setitem__` which caused wrong behavior when using an integer array with repeated values as a key (:issue:`58530`)
 - Bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`)
 - Bug in comparison between object with :class:`ArrowDtype` and incompatible-dtyped (e.g. string vs bool) incorrectly raising instead of returning all-``False`` (for ``==``) or all-``True`` (for ``!=``) (:issue:`59505`)
@@ -816,6 +819,7 @@ Other
 - Bug in :meth:`DataFrame.transform` that was returning the wrong order unless the index was monotonically increasing. (:issue:`57069`)
 - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`)
 - Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`)
+- Bug in :meth:`MultiIndex.fillna` error message was referring to ``isna`` instead of ``fillna`` (:issue:`60974`)
 - Bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`)
 - Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`)
 - Bug in :meth:`Series.isin` raising ``TypeError`` when series is large (>10**6) and ``values`` contains NA (:issue:`60678`)
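The new ``"delete_rows"`` option listed above might be used as follows (a sketch; the table name and engine are hypothetical, and the option assumes a pandas version with :issue:`37210` merged):

```python
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine("sqlite:///:memory:")  # hypothetical target DB
df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})

df.to_sql("users", engine, index=False)
# Unlike "replace", which drops and recreates the table, "delete_rows"
# clears existing records but preserves the table schema before insert.
df.to_sql("users", engine, if_exists="delete_rows", index=False)
```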
28 changes: 1 addition & 27 deletions pandas/_libs/algos.pyx
@@ -818,33 +818,7 @@ def is_monotonic(const numeric_object_t[:] arr, bint timelike):
     if timelike and <int64_t>arr[0] == NPY_NAT:
         return False, False, False
 
-    if numeric_object_t is not object:
-        with nogil:
-            prev = arr[0]
-            for i in range(1, n):
-                cur = arr[i]
-                if timelike and <int64_t>cur == NPY_NAT:
-                    is_monotonic_inc = 0
-                    is_monotonic_dec = 0
-                    break
-                if cur < prev:
-                    is_monotonic_inc = 0
-                elif cur > prev:
-                    is_monotonic_dec = 0
-                elif cur == prev:
-                    is_unique = 0
-                else:
-                    # cur or prev is NaN
-                    is_monotonic_inc = 0
-                    is_monotonic_dec = 0
-                    break
-                if not is_monotonic_inc and not is_monotonic_dec:
-                    is_monotonic_inc = 0
-                    is_monotonic_dec = 0
-                    break
-                prev = cur
-    else:
-        # object-dtype, identical to above except we cannot use `with nogil`
+    with nogil(numeric_object_t is not object):
         prev = arr[0]
         for i in range(1, n):
             cur = arr[i]
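This hunk collapses two near-identical loops into one via Cython's conditional ``nogil`` context: ``with nogil(condition)`` releases the GIL only when the compile-time condition holds, so the object-dtype specialization keeps the GIL while numeric ones drop it. Behavior is unchanged; the public entry points that reach this helper still report monotonicity and uniqueness the same way, e.g.:

```python
import pandas as pd

# Index monotonicity checks funnel into _libs.algos.is_monotonic.
idx = pd.Index([1, 2, 2, 3])
print(idx.is_monotonic_increasing)  # True
print(idx.is_unique)                # False (the repeated 2)
```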
15 changes: 1 addition & 14 deletions pandas/_libs/hashtable_func_helper.pxi.in
@@ -415,20 +415,7 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
 
     modes = np.empty(nkeys, dtype=values.dtype)
 
-    if htfunc_t is not object:
-        with nogil:
-            for k in range(nkeys):
-                count = counts[k]
-                if count == max_count:
-                    j += 1
-                elif count > max_count:
-                    max_count = count
-                    j = 0
-                else:
-                    continue
-
-                modes[j] = keys[k]
-    else:
+    with nogil(htfunc_t is not object):
         for k in range(nkeys):
             count = counts[k]
             if count == max_count:
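The same conditional-``nogil`` consolidation is applied to ``mode``. The tie-keeping logic in the loop (every key matching ``max_count`` is retained) is what produces multi-modal results at the Python level:

```python
import pandas as pd

# Both 2 and 3 reach the maximum count, so both are returned.
s = pd.Series([1, 2, 2, 3, 3])
print(s.mode().tolist())  # [2, 3]
```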
6 changes: 3 additions & 3 deletions pandas/_libs/internals.pyx
@@ -502,7 +502,7 @@ def get_concat_blkno_indexers(list blknos_list not None):
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def get_blkno_indexers(
-    int64_t[:] blknos, bint group=True
+    const int64_t[:] blknos, bint group=True
 ) -> list[tuple[int, slice | np.ndarray]]:
     """
     Enumerate contiguous runs of integers in ndarray.
@@ -596,8 +596,8 @@ def get_blkno_placements(blknos, group: bool = True):
 @cython.boundscheck(False)
 @cython.wraparound(False)
 cpdef update_blklocs_and_blknos(
-    ndarray[intp_t, ndim=1] blklocs,
-    ndarray[intp_t, ndim=1] blknos,
+    const intp_t[:] blklocs,
+    const intp_t[:] blknos,
     Py_ssize_t loc,
     intp_t nblocks,
 ):
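Switching these signatures from writable buffers to ``const`` memoryviews lets the functions accept read-only arrays without copying. A plain-Python illustration of the constraint being lifted:

```python
import numpy as np

# A read-only NumPy array cannot back a writable buffer; only a
# const-qualified (read-only) view can accept it without a copy.
arr = np.arange(5, dtype=np.intp)
arr.setflags(write=False)
view = memoryview(arr)
print(view.readonly)  # True
```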
17 changes: 10 additions & 7 deletions pandas/_libs/join.pyx
@@ -225,7 +225,10 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right,
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-cdef void _get_result_indexer(intp_t[::1] sorter, intp_t[::1] indexer) noexcept nogil:
+cdef void _get_result_indexer(
+    const intp_t[::1] sorter,
+    intp_t[::1] indexer,
+) noexcept nogil:
     """NOTE: overwrites indexer with the result to avoid allocating another array"""
     cdef:
         Py_ssize_t i, n, idx
@@ -681,8 +684,8 @@ def outer_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t]
 from pandas._libs.hashtable cimport Int64HashTable
 
 
-def asof_join_backward_on_X_by_Y(ndarray[numeric_t] left_values,
-                                 ndarray[numeric_t] right_values,
+def asof_join_backward_on_X_by_Y(const numeric_t[:] left_values,
+                                 const numeric_t[:] right_values,
                                  const int64_t[:] left_by_values,
                                  const int64_t[:] right_by_values,
                                  bint allow_exact_matches=True,
@@ -752,8 +755,8 @@ def asof_join_backward_on_X_by_Y(ndarray[numeric_t] left_values,
     return left_indexer, right_indexer
 
 
-def asof_join_forward_on_X_by_Y(ndarray[numeric_t] left_values,
-                                ndarray[numeric_t] right_values,
+def asof_join_forward_on_X_by_Y(const numeric_t[:] left_values,
+                                const numeric_t[:] right_values,
                                 const int64_t[:] left_by_values,
                                 const int64_t[:] right_by_values,
                                 bint allow_exact_matches=1,
@@ -824,8 +827,8 @@ def asof_join_forward_on_X_by_Y(ndarray[numeric_t] left_values,
     return left_indexer, right_indexer
 
 
-def asof_join_nearest_on_X_by_Y(ndarray[numeric_t] left_values,
-                                ndarray[numeric_t] right_values,
+def asof_join_nearest_on_X_by_Y(const numeric_t[:] left_values,
+                                const numeric_t[:] right_values,
                                 const int64_t[:] left_by_values,
                                 const int64_t[:] right_by_values,
                                 bint allow_exact_matches=True,
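These asof-join helpers back ``pandas.merge_asof``; with ``const`` value buffers they no longer require writable inputs. A usage sketch of the public entry point (hypothetical data):

```python
import pandas as pd

left = pd.DataFrame({"t": [1, 3, 5], "v": ["a", "b", "c"]})
right = pd.DataFrame({"t": [2, 4], "w": ["x", "y"]})

# Backward asof join: each left row takes the last right row with t <= left.t.
print(pd.merge_asof(left, right, on="t"))
```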
6 changes: 2 additions & 4 deletions pandas/_libs/lib.pyx
@@ -981,16 +981,14 @@ def get_level_sorter(
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
+def count_level_2d(const uint8_t[:, :] mask,
                    const intp_t[:] labels,
                    Py_ssize_t max_bin,
                    ):
     cdef:
-        Py_ssize_t i, j, k, n
+        Py_ssize_t i, j, k = mask.shape[1], n = mask.shape[0]
         ndarray[int64_t, ndim=2] counts
 
-    n, k = (<object>mask).shape
-
     counts = np.zeros((n, max_bin), dtype="i8")
     with nogil:
         for i in range(n):
22 changes: 1 addition & 21 deletions pandas/_libs/reshape.pyx
@@ -40,27 +40,7 @@ def unstack(const numeric_object_t[:, :] values, const uint8_t[:] mask,
     cdef:
         Py_ssize_t i, j, w, nulls, s, offset
 
-    if numeric_object_t is not object:
-        # evaluated at compile-time
-        with nogil:
-            for i in range(stride):
-
-                nulls = 0
-                for j in range(length):
-
-                    for w in range(width):
-
-                        offset = j * width + w
-
-                        if mask[offset]:
-                            s = i * width + w
-                            new_values[j, s] = values[offset - nulls, i]
-                            new_mask[j, s] = 1
-                        else:
-                            nulls += 1
-
-    else:
-        # object-dtype, identical to above but we cannot use nogil
+    with nogil(numeric_object_t is not object):
         for i in range(stride):
 
             nulls = 0
7 changes: 6 additions & 1 deletion pandas/core/arrays/categorical.py
@@ -447,7 +447,12 @@ def __init__(
             if isinstance(values.dtype, ArrowDtype) and issubclass(
                 values.dtype.type, CategoricalDtypeType
             ):
-                arr = values._pa_array.combine_chunks()
+                from pandas import Index
+
+                if isinstance(values, Index):
+                    arr = values._data._pa_array.combine_chunks()
+                else:
+                    arr = values._pa_array.combine_chunks()
                 categories = arr.dictionary.to_pandas(types_mapper=ArrowDtype)
                 codes = arr.indices.to_numpy()
                 dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered)
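A sketch of the fixed construction path (GH 60563; requires ``pyarrow``, and assumes the Index is backed by an Arrow dictionary type):

```python
import pandas as pd
import pyarrow as pa

dtype = pd.ArrowDtype(pa.dictionary(pa.int32(), pa.string()))
idx = pd.Index(["a", "b", "a"], dtype=dtype)

# Previously this could raise, because an Index stores the Arrow data one
# level deeper (values._data._pa_array) than an ExtensionArray does.
cat = pd.Categorical(idx)
print(cat.categories.tolist())  # ['a', 'b']
```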
83 changes: 83 additions & 0 deletions pandas/core/arrays/string_.py
@@ -49,6 +49,7 @@
 )
 
 from pandas.core import (
+    missing,
     nanops,
     ops,
 )
@@ -870,6 +871,88 @@ def _reduce(
 
         raise TypeError(f"Cannot perform reduction '{name}' with string dtype")
 
+    def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> StringArray:
+        """
+        Return an ExtensionArray performing an accumulation operation.
+
+        The underlying data type might change.
+
+        Parameters
+        ----------
+        name : str
+            Name of the function, supported values are:
+            - cummin
+            - cummax
+            - cumsum
+            - cumprod
+        skipna : bool, default True
+            If True, skip NA values.
+        **kwargs
+            Additional keyword arguments passed to the accumulation function.
+            Currently, there is no supported kwarg.
+
+        Returns
+        -------
+        array
+
+        Raises
+        ------
+        NotImplementedError : subclass does not define accumulations
+        """
+        if name == "cumprod":
+            msg = f"operation '{name}' not supported for dtype '{self.dtype}'"
+            raise TypeError(msg)
+
+        # We may need to strip out trailing NA values
+        tail: np.ndarray | None = None
+        na_mask: np.ndarray | None = None
+        ndarray = self._ndarray
+        np_func = {
+            "cumsum": np.cumsum,
+            "cummin": np.minimum.accumulate,
+            "cummax": np.maximum.accumulate,
+        }[name]
+
+        if self._hasna:
+            na_mask = cast("npt.NDArray[np.bool_]", isna(ndarray))
+            if np.all(na_mask):
+                return type(self)(ndarray)
+            if skipna:
+                if name == "cumsum":
+                    ndarray = np.where(na_mask, "", ndarray)
+                else:
+                    # We can retain the running min/max by forward/backward filling.
+                    ndarray = ndarray.copy()
+                    missing.pad_or_backfill_inplace(
+                        ndarray,
+                        method="pad",
+                        axis=0,
+                    )
+                    missing.pad_or_backfill_inplace(
+                        ndarray,
+                        method="backfill",
+                        axis=0,
+                    )
+            else:
+                # When not skipping NA values, the result should be null from
+                # the first NA value onward.
+                idx = np.argmax(na_mask)
+                tail = np.empty(len(ndarray) - idx, dtype="object")
+                tail[:] = self.dtype.na_value
+                ndarray = ndarray[:idx]
+
+        # mypy: Cannot call function of unknown type
+        np_result = np_func(ndarray)  # type: ignore[operator]
+
+        if tail is not None:
+            np_result = np.hstack((np_result, tail))
+        elif na_mask is not None:
+            # Argument 2 to "where" has incompatible type "NAType | float"
+            np_result = np.where(na_mask, self.dtype.na_value, np_result)  # type: ignore[arg-type]
+
+        result = type(self)(np_result)
+        return result
+
     def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any:
         if self.dtype.na_value is np.nan and result is libmissing.NA:
             # the masked_reductions use pd.NA -> convert to np.nan
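A minimal NumPy sketch of the ``skipna`` path for ``cumsum`` in the new method (object-dtype strings, as ``StringArray._ndarray`` holds; the names here are illustrative):

```python
import numpy as np

values = np.array(["a", None, "b"], dtype=object)
na_mask = np.array([v is None for v in values])

# Fill NA with "" so np.cumsum (string concatenation on object dtype)
# runs end to end, then restore NA at the masked positions.
filled = np.where(na_mask, "", values)
result = np.where(na_mask, None, np.cumsum(filled))
print(result)  # ['a' None 'ab']
```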
6 changes: 6 additions & 0 deletions pandas/core/generic.py
@@ -2801,6 +2801,12 @@ def to_sql(
         Databases supported by SQLAlchemy [1]_ are supported. Tables can be
         newly created, appended to, or overwritten.
 
+        .. warning::
+            The pandas library does not attempt to sanitize inputs provided via a to_sql call.
+            Please refer to the documentation for the underlying database driver to see if it
+            will properly prevent injection, or alternatively be advised of a security risk when
+            executing arbitrary commands in a to_sql call.
+
         Parameters
         ----------
         name : str
5 changes: 1 addition & 4 deletions pandas/core/indexes/multi.py
@@ -212,8 +212,6 @@ class MultiIndex(Index):
         level).
     names : optional sequence of objects
         Names for each of the index levels. (name is accepted for compat).
-    dtype : Numpy dtype or pandas type, optional
-        Data type for the MultiIndex.
     copy : bool, default False
         Copy the meta-data.
     name : Label
@@ -305,7 +303,6 @@ def __new__(
         codes=None,
         sortorder=None,
         names=None,
-        dtype=None,
         copy: bool = False,
         name=None,
         verify_integrity: bool = True,
@@ -1760,7 +1757,7 @@ def fillna(self, value):
         """
         fillna is not implemented for MultiIndex
         """
-        raise NotImplementedError("isna is not defined for MultiIndex")
+        raise NotImplementedError("fillna is not defined for MultiIndex")
 
     @doc(Index.dropna)
     def dropna(self, how: AnyAll = "any") -> MultiIndex:
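The corrected message is observable directly (the method still raises; only the wording changes):

```python
import pandas as pd

mi = pd.MultiIndex.from_tuples([("a", 1), ("b", 2)])
try:
    mi.fillna("x")
except NotImplementedError as exc:
    print(exc)  # "fillna is not defined for MultiIndex" after this fix
```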