Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use more, cheaper dtype checking utilities in cudf Python #18139

Open
wants to merge 1 commit into
base: branch-25.04
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 0 additions & 13 deletions python/cudf/cudf/api/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,19 +73,6 @@ def is_numeric_dtype(obj):
return pd_types.is_numeric_dtype(obj)


# Numeric dtype check that leaves out cudf decimals, for call sites that
# have to tell fixed-point numbers apart from floating-point ones.
def _is_non_decimal_numeric_dtype(obj):
    """Return True if ``obj`` is numeric and not backed by a ``_BaseDtype``.

    ``obj`` is rejected outright when it is itself a ``_BaseDtype`` instance
    or when it exposes a ``dtype`` attribute that is one. Anything else is
    handed to ``pd_types.is_numeric_dtype``; a ``TypeError`` raised by that
    call is treated as "not numeric" instead of being propagated.

    NOTE(review): ``_BaseDtype`` is presumably the common base class of the
    cudf extension dtypes (decimals included) -- confirm at its definition.
    """
    # Check the object first so ``obj.dtype`` is only touched when needed.
    if isinstance(obj, _BaseDtype):
        return False
    if isinstance(getattr(obj, "dtype", None), _BaseDtype):
        return False

    try:
        is_numeric = pd_types.is_numeric_dtype(obj)
    except TypeError:
        # Inputs the pandas helper cannot interpret count as non-numeric.
        is_numeric = False
    return is_numeric


def is_integer(obj):
"""Return True if given object is integer.

Expand Down
12 changes: 8 additions & 4 deletions python/cudf/cudf/core/_internals/where.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,13 @@
import numpy as np

import cudf
from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar
from cudf.api.types import is_scalar
from cudf.core.dtypes import CategoricalDtype
from cudf.utils.dtypes import find_common_type, is_mixed_with_object_dtype
from cudf.utils.dtypes import (
find_common_type,
is_dtype_obj_numeric,
is_mixed_with_object_dtype,
)

if TYPE_CHECKING:
from cudf._typing import DtypeObj, ScalarLike
Expand All @@ -18,7 +22,7 @@

def _normalize_categorical(input_col, other):
if isinstance(input_col, cudf.core.column.CategoricalColumn):
if cudf.api.types.is_scalar(other):
if is_scalar(other):
try:
other = input_col._encode(other)
except ValueError:
Expand Down Expand Up @@ -81,7 +85,7 @@ def _check_and_cast_columns_with_other(
)
return _normalize_categorical(source_col, other.astype(source_dtype))

if _is_non_decimal_numeric_dtype(source_dtype) and as_column(
if is_dtype_obj_numeric(source_dtype, include_decimal=False) and as_column(
other
).can_cast_safely(source_dtype):
common_dtype = source_dtype
Expand Down
11 changes: 5 additions & 6 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import pylibcudf as plc

import cudf
from cudf.api.types import is_scalar
from cudf.core.column import column
from cudf.core.column.methods import ColumnMethods
from cudf.core.dtypes import CategoricalDtype, IntervalDtype
Expand Down Expand Up @@ -622,12 +623,10 @@ def ordered(self) -> bool:
return self.dtype.ordered

def __setitem__(self, key, value):
if cudf.api.types.is_scalar(
value
) and cudf.utils.utils._is_null_host_scalar(value):
if is_scalar(value) and cudf.utils.utils._is_null_host_scalar(value):
to_add_categories = 0
else:
if cudf.api.types.is_scalar(value):
if is_scalar(value):
arr = column.as_column(value, length=1, nan_as_null=False)
else:
arr = column.as_column(value, nan_as_null=False)
Expand All @@ -643,7 +642,7 @@ def __setitem__(self, key, value):
"category, set the categories first"
)

if cudf.api.types.is_scalar(value):
if is_scalar(value):
value = self._encode(value) if value is not None else value
else:
value = cudf.core.column.as_column(value).astype(self.dtype)
Expand Down Expand Up @@ -1044,7 +1043,7 @@ def _validate_fillna_value(
self, fill_value: ScalarLike | ColumnLike
) -> cudf.Scalar | ColumnBase:
"""Align fill_value for .fillna based on column type."""
if cudf.api.types.is_scalar(fill_value):
if is_scalar(fill_value):
if fill_value != _DEFAULT_CATEGORICAL_VALUE:
try:
fill_value = self._encode(fill_value)
Expand Down
16 changes: 9 additions & 7 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,10 @@

import cudf
from cudf.api.types import (
_is_non_decimal_numeric_dtype,
_is_pandas_nullable_extension_dtype,
infer_dtype,
is_decimal_dtype,
is_dtype_equal,
is_scalar,
is_string_dtype,
)
from cudf.core._compat import PANDAS_GE_210
from cudf.core._internals import (
Expand Down Expand Up @@ -69,6 +66,7 @@
find_common_type,
get_time_unit,
is_column_like,
is_dtype_obj_numeric,
is_mixed_with_object_dtype,
min_signed_type,
min_unsigned_type,
Expand Down Expand Up @@ -858,7 +856,7 @@ def _fill(
if end <= begin or begin >= self.size:
return self if inplace else self.copy()

if not inplace or is_string_dtype(self.dtype):
if not inplace or self.dtype == CUDF_STRING_DTYPE:
with acquire_spill_lock():
result = type(self).from_pylibcudf(
plc.filling.fill(
Expand All @@ -868,7 +866,7 @@ def _fill(
fill_value,
)
)
if is_string_dtype(self.dtype):
if self.dtype == CUDF_STRING_DTYPE:
return self._mimic_inplace(result, inplace=True)
return result # type: ignore[return-value]

Expand Down Expand Up @@ -1592,7 +1590,10 @@ def cast(self, dtype: Dtype) -> ColumnBase:
self.to_pylibcudf(mode="read"), dtype_to_pylibcudf_type(dtype)
)
)
if is_decimal_dtype(result.dtype):
if isinstance(
result.dtype,
(cudf.Decimal128Dtype, cudf.Decimal64Dtype, cudf.Decimal32Dtype),
):
result.dtype.precision = dtype.precision # type: ignore[union-attr]
return result

Expand Down Expand Up @@ -2986,7 +2987,8 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
# Notice, we can always cast pure null columns
not_null_col_dtypes = [o.dtype for o in objs if o.null_count != len(o)]
if len(not_null_col_dtypes) and all(
_is_non_decimal_numeric_dtype(dtype) and dtype.kind == "M"
is_dtype_obj_numeric(dtype, include_decimal=False)
and dtype.kind == "M"
for dtype in not_null_col_dtypes
):
common_dtype = find_common_type(not_null_col_dtypes)
Expand Down
8 changes: 4 additions & 4 deletions python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,15 @@

import cudf
import cudf.core.column.column as column
from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar
from cudf.api.types import is_scalar
from cudf.core.buffer import acquire_spill_lock
from cudf.core.column.column import ColumnBase, as_column
from cudf.core.column.methods import ColumnMethods, ParentType
from cudf.core.column.numerical import NumericalColumn
from cudf.core.dtypes import ListDtype
from cudf.core.missing import NA
from cudf.core.scalar import pa_scalar_to_plc_scalar
from cudf.utils.dtypes import SIZE_TYPE_DTYPE
from cudf.utils.dtypes import SIZE_TYPE_DTYPE, is_dtype_obj_numeric

if TYPE_CHECKING:
from collections.abc import Sequence
Expand Down Expand Up @@ -718,8 +718,8 @@ def take(self, lists_indices: ColumnLike) -> ParentType:
"lists_indices and list column is of different size."
)
if (
not _is_non_decimal_numeric_dtype(
lists_indices_col.children[1].dtype
not is_dtype_obj_numeric(
lists_indices_col.children[1].dtype, include_decimal=False
)
or lists_indices_col.children[1].dtype.kind not in "iu"
):
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

import cudf
import cudf.core.column.column as column
from cudf.api.types import is_integer, is_scalar
from cudf.api.types import infer_dtype, is_integer, is_scalar
from cudf.core._internals import binaryop
from cudf.core.buffer import acquire_spill_lock, as_buffer
from cudf.core.column.column import ColumnBase, as_column
Expand Down Expand Up @@ -439,7 +439,7 @@ def _process_values_for_isin(
except (MixedTypeError, TypeError) as e:
# There is a corner where `values` can be of `object` dtype
# but have values of homogeneous type.
inferred_dtype = cudf.api.types.infer_dtype(values)
inferred_dtype = infer_dtype(values)
if (
self.dtype.kind in {"i", "u"} and inferred_dtype == "integer"
) or (
Expand Down
5 changes: 2 additions & 3 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,9 @@
import pylibcudf as plc

import cudf
import cudf.api.types
import cudf.core.column.column as column
import cudf.core.column.datetime as datetime
from cudf.api.types import is_integer, is_scalar, is_string_dtype
from cudf.api.types import is_integer, is_scalar
from cudf.core._internals import binaryop
from cudf.core.buffer import Buffer, acquire_spill_lock
from cudf.core.column.column import ColumnBase
Expand Down Expand Up @@ -76,7 +75,7 @@ def __init__(self, parent):
if isinstance(parent.dtype, cudf.ListDtype)
else parent.dtype
)
if not is_string_dtype(value_type):
if value_type != CUDF_STRING_DTYPE:
raise AttributeError(
"Can only use .str accessor with string values"
)
Expand Down
40 changes: 20 additions & 20 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,7 @@
is_dict_like,
is_dtype_equal,
is_list_like,
is_numeric_dtype,
is_object_dtype,
is_scalar,
is_string_dtype,
)
from cudf.core import column, indexing_utils, reshape
from cudf.core._compat import PANDAS_LT_300
Expand Down Expand Up @@ -91,6 +88,7 @@
cudf_dtype_from_pydata_dtype,
find_common_type,
is_column_like,
is_dtype_obj_numeric,
min_signed_type,
)
from cudf.utils.performance_tracking import _performance_tracking
Expand Down Expand Up @@ -146,7 +144,7 @@ def __setitem__(self, key, value):
return self._setitem_tuple_arg(key, value)

@_performance_tracking
def _can_downcast_to_series(self, df, arg):
def _can_downcast_to_series(self, df: DataFrame, arg):
"""
This method encapsulates the logic used
to determine whether or not the result of a loc/iloc
Expand All @@ -171,8 +169,8 @@ def _can_downcast_to_series(self, df, arg):
arg[1], slice
):
return True
dtypes = df.dtypes.values.tolist()
all_numeric = all(is_numeric_dtype(t) for t in dtypes)
dtypes = [dtype for _, dtype in df._dtypes]
all_numeric = all(is_dtype_obj_numeric(t) for t in dtypes)
if all_numeric or (
len(dtypes) and all(t == dtypes[0] for t in dtypes)
):
Expand Down Expand Up @@ -349,7 +347,7 @@ def _getitem_tuple_arg(self, arg):
df.index.name = columns_df.index.name
if not isinstance(
df.index, MultiIndex
) and is_numeric_dtype(df.index.dtype):
) and is_dtype_obj_numeric(df.index.dtype):
# Preserve the original index type.
df.index = df.index.astype(self._frame.index.dtype)
df = df.sort_values(by=[tmp_col_name, cantor_name])
Expand Down Expand Up @@ -3144,7 +3142,7 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None):
# If other was provided, process that next.
if isinstance(other, DataFrame):
other_cols = [other._data[col] for col in self._column_names]
elif cudf.api.types.is_scalar(other):
elif is_scalar(other):
other_cols = [other] * len(self._column_names)
elif isinstance(other, cudf.Series):
other_cols = other.to_pandas()
Expand Down Expand Up @@ -3774,14 +3772,14 @@ def agg(self, aggs, axis=None):
* Not supporting: ``axis``, ``*args``, ``**kwargs``

"""
dtypes = [self[col].dtype for col in self._column_names]
dtypes = [dtype for _, dtype in self._dtypes]
common_dtype = find_common_type(dtypes)
if common_dtype.kind != "b" and any(
dtype.kind == "b" for dtype in dtypes
):
raise MixedTypeError("Cannot create a column with mixed types")

if any(is_string_dtype(dt) for dt in dtypes):
if any(dt == CUDF_STRING_DTYPE for dt in dtypes):
raise NotImplementedError(
"DataFrame.agg() is not supported for "
"frames containing string columns"
Expand Down Expand Up @@ -4920,7 +4918,7 @@ def apply_rows(
"""
for col in incols:
current_col_dtype = self._data[col].dtype
if is_string_dtype(current_col_dtype) or isinstance(
if current_col_dtype == CUDF_STRING_DTYPE or isinstance(
current_col_dtype, cudf.CategoricalDtype
):
raise TypeError(
Expand Down Expand Up @@ -6280,8 +6278,8 @@ def make_false_column_like_self():
else:
# These checks must happen after the conversions above
# since numpy can't handle categorical dtypes.
self_is_str = is_string_dtype(self_col.dtype)
other_is_str = is_string_dtype(other_col.dtype)
self_is_str = self_col.dtype == CUDF_STRING_DTYPE
other_is_str = other_col.dtype == CUDF_STRING_DTYPE

if self_is_str != other_is_str:
# Strings can't compare to anything else.
Expand Down Expand Up @@ -6338,8 +6336,8 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only):
common_dtype = find_common_type(filtered.dtypes)
if (
not numeric_only
and is_string_dtype(common_dtype)
and any(not is_string_dtype(dt) for dt in filtered.dtypes)
and common_dtype == CUDF_STRING_DTYPE
and any(dtype != CUDF_STRING_DTYPE for dtype in filtered._dtypes)
):
raise TypeError(
f"Cannot perform row-wise {method} across mixed-dtype columns,"
Expand Down Expand Up @@ -6462,7 +6460,9 @@ def _reduce(

if numeric_only:
numeric_cols = (
name for name, dtype in self._dtypes if is_numeric_dtype(dtype)
name
for name, dtype in self._dtypes
if is_dtype_obj_numeric(dtype)
)
source = self._get_columns_by_label(numeric_cols)
if source.empty:
Expand Down Expand Up @@ -6493,7 +6493,7 @@ def _reduce(
raise NotImplementedError(
f"Column {col_label} with type {col.dtype} does not support {op}"
) from err
elif not is_numeric_dtype(col.dtype):
elif not is_dtype_obj_numeric(col.dtype):
raise TypeError(
"Non numeric columns passed with "
"`numeric_only=False`, pass `numeric_only=True` "
Expand All @@ -6509,9 +6509,9 @@ def _reduce(
source_dtypes = [dtype for _, dtype in source._dtypes]
common_dtype = find_common_type(source_dtypes)
if (
is_object_dtype(common_dtype)
common_dtype == CUDF_STRING_DTYPE
and any(
not is_object_dtype(dtype) for dtype in source_dtypes
dtype != CUDF_STRING_DTYPE for dtype in source_dtypes
)
or common_dtype.kind != "b"
and any(dtype.kind == "b" for dtype in source_dtypes)
Expand Down Expand Up @@ -8589,7 +8589,7 @@ def _find_common_dtypes_and_categories(
# default to the first non-null dtype
dtypes[idx] = cols[0].dtype
# If all the non-null dtypes are int/float, find a common dtype
if all(is_numeric_dtype(col.dtype) for col in cols):
if all(is_dtype_obj_numeric(col.dtype) for col in cols):
dtypes[idx] = find_common_type([col.dtype for col in cols])
# If all categorical dtypes, combine the categories
elif all(
Expand Down
Loading
Loading