From 840cb1fadbe40bac50090d78cc777f32503e3852 Mon Sep 17 00:00:00 2001 From: Ashar Khan Date: Tue, 25 Feb 2025 04:29:39 +0500 Subject: [PATCH 1/3] Fix arrow groupby na (#60777) * BUG: Fix factorize to ensure proper use of null_encoding parameter * DOC: Add whatsnew entry for dictionary array NA handling fix * BUG: Fix factorize to ensure proper use of null_encoding parameter and backwards compatibility maintained * DOC: Improve rst file and test case comments for arrow groupby NA fix --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/arrays/arrow/array.py | 7 ++++++- pandas/tests/extension/test_arrow.py | 12 ++++++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index edd205860b4e4..7ebcf18a36a96 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -790,6 +790,7 @@ ExtensionArray ^^^^^^^^^^^^^^ - Bug in :class:`Categorical` when constructing with an :class:`Index` with :class:`ArrowDtype` (:issue:`60563`) - Bug in :meth:`.arrays.ArrowExtensionArray.__setitem__` which caused wrong behavior when using an integer array with repeated values as a key (:issue:`58530`) +- Bug in :meth:`ArrowExtensionArray.factorize` where NA values were dropped when input was dictionary-encoded even when ``dropna`` was set to ``False`` (:issue:`60567`) - Bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`) - Bug in comparison between object with :class:`ArrowDtype` and incompatible-dtyped (e.g. 
string vs bool) incorrectly raising instead of returning all-``False`` (for ``==``) or all-``True`` (for ``!=``) (:issue:`59505`) - Bug in constructing pandas data structures when passing into ``dtype`` a string of the type followed by ``[pyarrow]`` while PyArrow is not installed would raise ``NameError`` rather than ``ImportError`` (:issue:`57928`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 0b546bed1c2b7..e2feda495c103 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1208,7 +1208,12 @@ def factorize( data = data.cast(pa.int64()) if pa.types.is_dictionary(data.type): - encoded = data + if null_encoding == "encode": + # dictionary encode does nothing if an already encoded array is given + data = data.cast(data.type.value_type) + encoded = data.dictionary_encode(null_encoding=null_encoding) + else: + encoded = data else: encoded = data.dictionary_encode(null_encoding=null_encoding) if encoded.length() == 0: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index f4a63ff4c92ec..fbd3868f62899 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3329,6 +3329,18 @@ def test_factorize_chunked_dictionary(): tm.assert_index_equal(res_uniques, exp_uniques) +def test_factorize_dictionary_with_na(): + # GH#60567 + arr = pd.array( + ["a1", pd.NA], dtype=ArrowDtype(pa.dictionary(pa.int32(), pa.utf8())) + ) + indices, uniques = arr.factorize(use_na_sentinel=False) + expected_indices = np.array([0, 1], dtype=np.intp) + expected_uniques = pd.array(["a1", None], dtype=ArrowDtype(pa.string())) + tm.assert_numpy_array_equal(indices, expected_indices) + tm.assert_extension_array_equal(uniques, expected_uniques) + + def test_dictionary_astype_categorical(): # GH#56672 arrs = [ From d246fe79358f6eb5e7e509991e0f866fb6518635 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Tue, 25 Feb 2025 23:14:27 +0530 Subject: [PATCH 
2/3] DOC: fix ES01 for pandas.DataFrame.astype (#61002) --- pandas/core/generic.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ccd801e252f2c..81fefe8b8f999 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6267,6 +6267,11 @@ def astype( """ Cast a pandas object to a specified dtype ``dtype``. + This method allows the conversion of the data types of pandas objects, + including DataFrames and Series, to the specified dtype. It supports casting + entire objects to a single data type or applying different data types to + individual columns using a mapping. + Parameters ---------- dtype : str, data type, Series or Mapping of column name -> data type From 10762c6607b5b5807db34571f889f5e91a57127c Mon Sep 17 00:00:00 2001 From: ChiLin Chiu Date: Wed, 26 Feb 2025 01:51:53 +0800 Subject: [PATCH 3/3] DOC: Fix missing a closing bracket in contributing codebase (#61004) --- doc/source/development/contributing_codebase.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index 143aebd8f236a..45d4e24b0df51 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -198,7 +198,7 @@ In some cases you may be tempted to use ``cast`` from the typing module when you obj = cast(str, obj) # Mypy complains without this! return obj.upper() -The limitation here is that while a human can reasonably understand that ``is_number`` would catch the ``int`` and ``float`` types mypy cannot make that same inference just yet (see `mypy #5206 `_. While the above works, the use of ``cast`` is **strongly discouraged**. 
Where applicable a refactor of the code to appease static analysis is preferable +The limitation here is that while a human can reasonably understand that ``is_number`` would catch the ``int`` and ``float`` types mypy cannot make that same inference just yet (see `mypy #5206 `_). While the above works, the use of ``cast`` is **strongly discouraged**. Where applicable a refactor of the code to appease static analysis is preferable .. code-block:: python