Skip to content

Commit

Permalink
REF: check monotonicity inside _can_use_libjoin (#55342)
Browse files Browse the repository at this point in the history
* REF: fix can_use_libjoin check

* DOC: docstring for can_use_libjoin

* Make can_use_libjoin checks more-correct

* avoid allocating mapping in monotonic cases

* fix categorical memory usage tests

* catch decimal.InvalidOperation

---------

Co-authored-by: Luke Manley <lukemanley@gmail.com>
  • Loading branch information
jbrockmendel and lukemanley authored Dec 27, 2023
1 parent 44330e8 commit ee8f335
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 17 deletions.
12 changes: 11 additions & 1 deletion pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ from pandas._libs.missing cimport (
is_matching_na,
)

from decimal import InvalidOperation

# Defines shift of MultiIndex codes to avoid negative codes (missing values)
multiindex_nulls_shift = 2

Expand Down Expand Up @@ -248,6 +250,10 @@ cdef class IndexEngine:

@property
def is_unique(self) -> bool:
# for why we check is_monotonic_increasing here, see
# https://github.com/pandas-dev/pandas/pull/55342#discussion_r1361405781
if self.need_monotonic_check:
self.is_monotonic_increasing
if self.need_unique_check:
self._do_unique_check()

Expand Down Expand Up @@ -281,7 +287,7 @@ cdef class IndexEngine:
values = self.values
self.monotonic_inc, self.monotonic_dec, is_strict_monotonic = \
self._call_monotonic(values)
except TypeError:
except (TypeError, InvalidOperation):
self.monotonic_inc = 0
self.monotonic_dec = 0
is_strict_monotonic = 0
Expand Down Expand Up @@ -843,6 +849,10 @@ cdef class SharedEngine:

@property
def is_unique(self) -> bool:
# for why we check is_monotonic_increasing here, see
# https://github.com/pandas-dev/pandas/pull/55342#discussion_r1361405781
if self.need_monotonic_check:
self.is_monotonic_increasing
if self.need_unique_check:
arr = self.values.unique()
self.unique = len(arr) == len(self.values)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3711,7 +3711,7 @@ def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
many repeated values.
>>> df['object'].astype('category').memory_usage(deep=True)
5244
5136
"""
result = self._constructor_sliced(
[c.memory_usage(index=False, deep=deep) for col, c in self.items()],
Expand Down
20 changes: 10 additions & 10 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3382,9 +3382,7 @@ def _union(self, other: Index, sort: bool | None):

if (
sort in (None, True)
and self.is_monotonic_increasing
and other.is_monotonic_increasing
and not (self.has_duplicates and other.has_duplicates)
and (self.is_unique or other.is_unique)
and self._can_use_libjoin
and other._can_use_libjoin
):
Expand Down Expand Up @@ -3536,12 +3534,7 @@ def _intersection(self, other: Index, sort: bool = False):
"""
intersection specialized to the case with matching dtypes.
"""
if (
self.is_monotonic_increasing
and other.is_monotonic_increasing
and self._can_use_libjoin
and other._can_use_libjoin
):
if self._can_use_libjoin and other._can_use_libjoin:
try:
res_indexer, indexer, _ = self._inner_indexer(other)
except TypeError:
Expand Down Expand Up @@ -4980,7 +4973,10 @@ def _get_leaf_sorter(labels: list[np.ndarray]) -> npt.NDArray[np.intp]:
def _join_monotonic(
self, other: Index, how: JoinHow = "left"
) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
# We only get here with matching dtypes and both monotonic increasing
# We only get here with (caller is responsible for ensuring):
# 1) matching dtypes
# 2) both monotonic increasing
# 3) other.is_unique or self.is_unique
assert other.dtype == self.dtype
assert self._can_use_libjoin and other._can_use_libjoin

Expand Down Expand Up @@ -5062,6 +5058,10 @@ def _can_use_libjoin(self) -> bool:
making a copy. If we cannot, this negates the performance benefit
of using libjoin.
"""
if not self.is_monotonic_increasing:
# The libjoin functions all assume monotonicity.
return False

if type(self) is Index:
# excludes EAs, but include masks, we get here with monotonic
# values only, meaning no NA
Expand Down
5 changes: 0 additions & 5 deletions pandas/tests/extension/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,6 @@ def data_for_grouping():


class TestCategorical(base.ExtensionTests):
@pytest.mark.xfail(reason="Memory usage doesn't match")
def test_memory_usage(self, data):
# TODO: Is this deliberate?
super().test_memory_usage(data)

def test_contains(self, data, data_missing):
# GH-37867
# na value handling in Categorical.__contains__ is deprecated.
Expand Down

0 comments on commit ee8f335

Please sign in to comment.