From 2d21f70df60c5aaca1d34d3d43ab8ee64ddf5845 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 28 Feb 2025 15:38:48 -0800 Subject: [PATCH] Use more, cheaper dtype checking utilities in cudf Python --- python/cudf/cudf/api/types.py | 13 ------- python/cudf/cudf/core/_internals/where.py | 12 ++++-- python/cudf/cudf/core/column/categorical.py | 11 +++--- python/cudf/cudf/core/column/column.py | 16 ++++---- python/cudf/cudf/core/column/lists.py | 8 ++-- python/cudf/cudf/core/column/numerical.py | 4 +- python/cudf/cudf/core/column/string.py | 5 +-- python/cudf/cudf/core/dataframe.py | 40 ++++++++++---------- python/cudf/cudf/core/groupby/groupby.py | 21 +++++----- python/cudf/cudf/core/index.py | 14 ++++--- python/cudf/cudf/core/indexed_frame.py | 28 +++++++------- python/cudf/cudf/core/join/_join_helpers.py | 18 ++++++--- python/cudf/cudf/core/multiindex.py | 22 +++++------ python/cudf/cudf/core/reshape.py | 4 +- python/cudf/cudf/core/scalar.py | 4 +- python/cudf/cudf/core/series.py | 6 ++- python/cudf/cudf/core/single_column_frame.py | 8 ++-- python/cudf/cudf/core/tools/datetimes.py | 2 +- python/cudf/cudf/core/tools/numeric.py | 13 ++++--- python/cudf/cudf/core/window/ewm.py | 4 +- python/cudf/cudf/io/dlpack.py | 7 ++-- python/cudf/cudf/testing/testing.py | 10 ++--- python/cudf/cudf/utils/dtypes.py | 14 +++++++ python/cudf/cudf/utils/utils.py | 1 - 24 files changed, 155 insertions(+), 130 deletions(-) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 37ef83c8820..8d7d64ab31e 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -73,19 +73,6 @@ def is_numeric_dtype(obj): return pd_types.is_numeric_dtype(obj) -# A version of numerical type check that does not include cudf decimals for -# places where we need to distinguish fixed and floating point numbers. 
-def _is_non_decimal_numeric_dtype(obj): - if isinstance(obj, _BaseDtype) or isinstance( - getattr(obj, "dtype", None), _BaseDtype - ): - return False - try: - return pd_types.is_numeric_dtype(obj) - except TypeError: - return False - - def is_integer(obj): """Return True if given object is integer. diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 73011d6ffe0..cf49dfb2194 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -7,9 +7,13 @@ import numpy as np import cudf -from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar +from cudf.api.types import is_scalar from cudf.core.dtypes import CategoricalDtype -from cudf.utils.dtypes import find_common_type, is_mixed_with_object_dtype +from cudf.utils.dtypes import ( + find_common_type, + is_dtype_obj_numeric, + is_mixed_with_object_dtype, +) if TYPE_CHECKING: from cudf._typing import DtypeObj, ScalarLike @@ -18,7 +22,7 @@ def _normalize_categorical(input_col, other): if isinstance(input_col, cudf.core.column.CategoricalColumn): - if cudf.api.types.is_scalar(other): + if is_scalar(other): try: other = input_col._encode(other) except ValueError: @@ -81,7 +85,7 @@ def _check_and_cast_columns_with_other( ) return _normalize_categorical(source_col, other.astype(source_dtype)) - if _is_non_decimal_numeric_dtype(source_dtype) and as_column( + if is_dtype_obj_numeric(source_dtype, include_decimal=False) and as_column( other ).can_cast_safely(source_dtype): common_dtype = source_dtype diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index d41e448254c..69b1e736c0a 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -14,6 +14,7 @@ import pylibcudf as plc import cudf +from cudf.api.types import is_scalar from cudf.core.column import column from cudf.core.column.methods import ColumnMethods from 
cudf.core.dtypes import CategoricalDtype, IntervalDtype @@ -622,12 +623,10 @@ def ordered(self) -> bool: return self.dtype.ordered def __setitem__(self, key, value): - if cudf.api.types.is_scalar( - value - ) and cudf.utils.utils._is_null_host_scalar(value): + if is_scalar(value) and cudf.utils.utils._is_null_host_scalar(value): to_add_categories = 0 else: - if cudf.api.types.is_scalar(value): + if is_scalar(value): arr = column.as_column(value, length=1, nan_as_null=False) else: arr = column.as_column(value, nan_as_null=False) @@ -643,7 +642,7 @@ def __setitem__(self, key, value): "category, set the categories first" ) - if cudf.api.types.is_scalar(value): + if is_scalar(value): value = self._encode(value) if value is not None else value else: value = cudf.core.column.as_column(value).astype(self.dtype) @@ -1044,7 +1043,7 @@ def _validate_fillna_value( self, fill_value: ScalarLike | ColumnLike ) -> cudf.Scalar | ColumnBase: """Align fill_value for .fillna based on column type.""" - if cudf.api.types.is_scalar(fill_value): + if is_scalar(fill_value): if fill_value != _DEFAULT_CATEGORICAL_VALUE: try: fill_value = self._encode(fill_value) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 61f4f7d52fb..3e54712b270 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -23,13 +23,10 @@ import cudf from cudf.api.types import ( - _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, infer_dtype, - is_decimal_dtype, is_dtype_equal, is_scalar, - is_string_dtype, ) from cudf.core._compat import PANDAS_GE_210 from cudf.core._internals import ( @@ -69,6 +66,7 @@ find_common_type, get_time_unit, is_column_like, + is_dtype_obj_numeric, is_mixed_with_object_dtype, min_signed_type, min_unsigned_type, @@ -858,7 +856,7 @@ def _fill( if end <= begin or begin >= self.size: return self if inplace else self.copy() - if not inplace or is_string_dtype(self.dtype): + if not inplace 
or self.dtype == CUDF_STRING_DTYPE: with acquire_spill_lock(): result = type(self).from_pylibcudf( plc.filling.fill( @@ -868,7 +866,7 @@ def _fill( fill_value, ) ) - if is_string_dtype(self.dtype): + if self.dtype == CUDF_STRING_DTYPE: return self._mimic_inplace(result, inplace=True) return result # type: ignore[return-value] @@ -1592,7 +1590,10 @@ def cast(self, dtype: Dtype) -> ColumnBase: self.to_pylibcudf(mode="read"), dtype_to_pylibcudf_type(dtype) ) ) - if is_decimal_dtype(result.dtype): + if isinstance( + result.dtype, + (cudf.Decimal128Dtype, cudf.Decimal64Dtype, cudf.Decimal32Dtype), + ): result.dtype.precision = dtype.precision # type: ignore[union-attr] return result @@ -2986,7 +2987,8 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: # Notice, we can always cast pure null columns not_null_col_dtypes = [o.dtype for o in objs if o.null_count != len(o)] if len(not_null_col_dtypes) and all( - _is_non_decimal_numeric_dtype(dtype) and dtype.kind == "M" + is_dtype_obj_numeric(dtype, include_decimal=False) + and dtype.kind == "M" for dtype in not_null_col_dtypes ): common_dtype = find_common_type(not_null_col_dtypes) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 837763ee30c..ca29f83225b 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -14,7 +14,7 @@ import cudf import cudf.core.column.column as column -from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar +from cudf.api.types import is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase, as_column from cudf.core.column.methods import ColumnMethods, ParentType @@ -22,7 +22,7 @@ from cudf.core.dtypes import ListDtype from cudf.core.missing import NA from cudf.core.scalar import pa_scalar_to_plc_scalar -from cudf.utils.dtypes import SIZE_TYPE_DTYPE +from cudf.utils.dtypes import SIZE_TYPE_DTYPE, is_dtype_obj_numeric if 
TYPE_CHECKING: from collections.abc import Sequence @@ -718,8 +718,8 @@ def take(self, lists_indices: ColumnLike) -> ParentType: "lists_indices and list column is of different size." ) if ( - not _is_non_decimal_numeric_dtype( - lists_indices_col.children[1].dtype + not is_dtype_obj_numeric( + lists_indices_col.children[1].dtype, include_decimal=False ) or lists_indices_col.children[1].dtype.kind not in "iu" ): diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index eecb294acee..9d37292e412 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -14,7 +14,7 @@ import cudf import cudf.core.column.column as column -from cudf.api.types import is_integer, is_scalar +from cudf.api.types import infer_dtype, is_integer, is_scalar from cudf.core._internals import binaryop from cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.core.column.column import ColumnBase, as_column @@ -439,7 +439,7 @@ def _process_values_for_isin( except (MixedTypeError, TypeError) as e: # There is a corner where `values` can be of `object` dtype # but have values of homogeneous type. 
- inferred_dtype = cudf.api.types.infer_dtype(values) + inferred_dtype = infer_dtype(values) if ( self.dtype.kind in {"i", "u"} and inferred_dtype == "integer" ) or ( diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index b82ec1958fb..9f3512369a0 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -16,10 +16,9 @@ import pylibcudf as plc import cudf -import cudf.api.types import cudf.core.column.column as column import cudf.core.column.datetime as datetime -from cudf.api.types import is_integer, is_scalar, is_string_dtype +from cudf.api.types import is_integer, is_scalar from cudf.core._internals import binaryop from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column.column import ColumnBase @@ -76,7 +75,7 @@ def __init__(self, parent): if isinstance(parent.dtype, cudf.ListDtype) else parent.dtype ) - if not is_string_dtype(value_type): + if value_type != CUDF_STRING_DTYPE: raise AttributeError( "Can only use .str accessor with string values" ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 69db055fe87..3770aef872a 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -42,10 +42,7 @@ is_dict_like, is_dtype_equal, is_list_like, - is_numeric_dtype, - is_object_dtype, is_scalar, - is_string_dtype, ) from cudf.core import column, indexing_utils, reshape from cudf.core._compat import PANDAS_LT_300 @@ -91,6 +88,7 @@ cudf_dtype_from_pydata_dtype, find_common_type, is_column_like, + is_dtype_obj_numeric, min_signed_type, ) from cudf.utils.performance_tracking import _performance_tracking @@ -146,7 +144,7 @@ def __setitem__(self, key, value): return self._setitem_tuple_arg(key, value) @_performance_tracking - def _can_downcast_to_series(self, df, arg): + def _can_downcast_to_series(self, df: DataFrame, arg): """ This method encapsulates the logic used to determine whether or not the 
result of a loc/iloc @@ -171,8 +169,8 @@ def _can_downcast_to_series(self, df, arg): arg[1], slice ): return True - dtypes = df.dtypes.values.tolist() - all_numeric = all(is_numeric_dtype(t) for t in dtypes) + dtypes = [dtype for _, dtype in df._dtypes] + all_numeric = all(is_dtype_obj_numeric(t) for t in dtypes) if all_numeric or ( len(dtypes) and all(t == dtypes[0] for t in dtypes) ): @@ -349,7 +347,7 @@ def _getitem_tuple_arg(self, arg): df.index.name = columns_df.index.name if not isinstance( df.index, MultiIndex - ) and is_numeric_dtype(df.index.dtype): + ) and is_dtype_obj_numeric(df.index.dtype): # Preserve the original index type. df.index = df.index.astype(self._frame.index.dtype) df = df.sort_values(by=[tmp_col_name, cantor_name]) @@ -3144,7 +3142,7 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None): # If other was provided, process that next. if isinstance(other, DataFrame): other_cols = [other._data[col] for col in self._column_names] - elif cudf.api.types.is_scalar(other): + elif is_scalar(other): other_cols = [other] * len(self._column_names) elif isinstance(other, cudf.Series): other_cols = other.to_pandas() @@ -3774,14 +3772,14 @@ def agg(self, aggs, axis=None): * Not supporting: ``axis``, ``*args``, ``**kwargs`` """ - dtypes = [self[col].dtype for col in self._column_names] + dtypes = [dtype for _, dtype in self._dtypes] common_dtype = find_common_type(dtypes) if common_dtype.kind != "b" and any( dtype.kind == "b" for dtype in dtypes ): raise MixedTypeError("Cannot create a column with mixed types") - if any(is_string_dtype(dt) for dt in dtypes): + if any(dt == CUDF_STRING_DTYPE for dt in dtypes): raise NotImplementedError( "DataFrame.agg() is not supported for " "frames containing string columns" @@ -4920,7 +4918,7 @@ def apply_rows( """ for col in incols: current_col_dtype = self._data[col].dtype - if is_string_dtype(current_col_dtype) or isinstance( + if current_col_dtype == CUDF_STRING_DTYPE or isinstance( 
current_col_dtype, cudf.CategoricalDtype ): raise TypeError( @@ -6280,8 +6278,8 @@ def make_false_column_like_self(): else: # These checks must happen after the conversions above # since numpy can't handle categorical dtypes. - self_is_str = is_string_dtype(self_col.dtype) - other_is_str = is_string_dtype(other_col.dtype) + self_is_str = self_col.dtype == CUDF_STRING_DTYPE + other_is_str = other_col.dtype == CUDF_STRING_DTYPE if self_is_str != other_is_str: # Strings can't compare to anything else. @@ -6338,8 +6336,8 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only): common_dtype = find_common_type(filtered.dtypes) if ( not numeric_only - and is_string_dtype(common_dtype) - and any(not is_string_dtype(dt) for dt in filtered.dtypes) + and common_dtype == CUDF_STRING_DTYPE + and any(dt != CUDF_STRING_DTYPE for _, dt in filtered._dtypes) ): raise TypeError( f"Cannot perform row-wise {method} across mixed-dtype columns," @@ -6462,7 +6460,9 @@ def _reduce( if numeric_only: numeric_cols = ( - name for name, dtype in self._dtypes if is_numeric_dtype(dtype) + name + for name, dtype in self._dtypes + if is_dtype_obj_numeric(dtype) ) source = self._get_columns_by_label(numeric_cols) if source.empty: @@ -6493,7 +6493,7 @@ def _reduce( raise NotImplementedError( f"Column {col_label} with type {col.dtype} does not support {op}" ) from err - elif not is_numeric_dtype(col.dtype): + elif not is_dtype_obj_numeric(col.dtype): raise TypeError( "Non numeric columns passed with " "`numeric_only=False`, pass `numeric_only=True` " @@ -6509,9 +6509,9 @@ def _reduce( source_dtypes = [dtype for _, dtype in source._dtypes] common_dtype = find_common_type(source_dtypes) if ( - is_object_dtype(common_dtype) + common_dtype == CUDF_STRING_DTYPE and any( - not is_object_dtype(dtype) for dtype in source_dtypes + dtype != CUDF_STRING_DTYPE for dtype in source_dtypes ) or common_dtype.kind != "b" and any(dtype.kind == "b" for dtype in source_dtypes) @@ -8589,7 +8589,7 @@ def
_find_common_dtypes_and_categories( # default to the first non-null dtype dtypes[idx] = cols[0].dtype # If all the non-null dtypes are int/float, find a common dtype - if all(is_numeric_dtype(col.dtype) for col in cols): + if all(is_dtype_obj_numeric(col.dtype) for col in cols): dtypes[idx] = find_common_type([col.dtype for col in cols]) # If all categorical dtypes, combine the categories elif all( diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 38b519c6d5f..df11ebd4f94 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -20,11 +20,7 @@ import cudf from cudf.api.extensions import no_default -from cudf.api.types import ( - is_list_like, - is_numeric_dtype, - is_string_dtype, -) +from cudf.api.types import is_list_like, is_scalar from cudf.core._compat import PANDAS_LT_300 from cudf.core._internals import aggregation, sorting, stream_compaction from cudf.core.abc import Serializable @@ -44,7 +40,12 @@ from cudf.core.multiindex import MultiIndex from cudf.core.scalar import pa_scalar_to_plc_scalar from cudf.core.udf.groupby_utils import _can_be_jitted, jit_groupby_apply -from cudf.utils.dtypes import SIZE_TYPE_DTYPE, cudf_dtype_to_pa_type +from cudf.utils.dtypes import ( + CUDF_STRING_DTYPE, + SIZE_TYPE_DTYPE, + cudf_dtype_to_pa_type, + is_dtype_obj_numeric, +) from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import GetAttrGetItemMixin @@ -91,7 +92,7 @@ @singledispatch def get_valid_aggregation(dtype): - if is_string_dtype(dtype): + if dtype == CUDF_STRING_DTYPE: return _STRING_AGGS return "ALL" @@ -1788,7 +1789,7 @@ def _post_process_chunk_results( ): if not len(chunk_results): return self.obj.head(0) - if isinstance(chunk_results, ColumnBase) or cudf.api.types.is_scalar( + if isinstance(chunk_results, ColumnBase) or is_scalar( chunk_results[0] ): data = ColumnAccessor( @@ -3077,7 +3078,9 @@ def _reduce_numeric_only(self, 
op: str): columns = list( name for name, dtype in self.obj._dtypes - if (is_numeric_dtype(dtype) and name not in self.grouping.names) + if ( + is_dtype_obj_numeric(dtype) and name not in self.grouping.names + ) ) return self[columns].agg(op) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index f4e5f6e96ae..05a2a46c051 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -20,12 +20,11 @@ import cudf from cudf.api.extensions import no_default from cudf.api.types import ( - _is_non_decimal_numeric_dtype, is_dtype_equal, + is_hashable, is_integer, is_list_like, is_scalar, - is_string_dtype, ) from cudf.core._base_index import BaseIndex, _return_get_indexer_result from cudf.core._compat import PANDAS_LT_300 @@ -57,6 +56,7 @@ cudf_dtype_from_pa_type, cudf_dtype_to_pa_type, find_common_type, + is_dtype_obj_numeric, is_mixed_with_object_dtype, ) from cudf.utils.performance_tracking import _performance_tracking @@ -232,7 +232,7 @@ class RangeIndex(BaseIndex, BinaryOperand): def __init__( self, start, stop=None, step=1, dtype=None, copy=False, name=None ): - if not cudf.api.types.is_hashable(name): + if not is_hashable(name): raise ValueError("Name must be a hashable value.") self._name = name if dtype is not None and cudf.dtype(dtype).kind != "i": @@ -1786,7 +1786,7 @@ def isin(self, values, level=None) -> cupy.ndarray: @property @_performance_tracking def str(self): - if is_string_dtype(self.dtype): + if self.dtype == CUDF_STRING_DTYPE: return StringMethods(parent=self) else: raise AttributeError( @@ -3366,7 +3366,7 @@ def interval_range( "freq, exactly three must be specified" ) - if periods is not None and not cudf.api.types.is_integer(periods): + if periods is not None and not is_integer(periods): warnings.warn( "Non-integer 'periods' in cudf.date_range, and cudf.interval_range" " are deprecated and will raise in a future version.", @@ -3390,7 +3390,9 @@ def interval_range( pa_freq = pa.scalar(freq) if any( - 
not _is_non_decimal_numeric_dtype(cudf_dtype_from_pa_type(x.type)) + not is_dtype_obj_numeric( + cudf_dtype_from_pa_type(x.type), include_decimal=False + ) for x in (pa_start, pa.scalar(periods), pa_freq, pa_end) ): raise ValueError("start, end, periods, freq must be numeric values.") diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 9d426ad6bf7..8925f113309 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -30,7 +30,6 @@ import cudf.core.algorithms from cudf.api.extensions import no_default from cudf.api.types import ( - _is_non_decimal_numeric_dtype, is_dict_like, is_list_like, is_scalar, @@ -60,7 +59,11 @@ from cudf.utils import docutils, ioutils from cudf.utils._numba import _CUDFNumbaConfig from cudf.utils.docutils import copy_docstring -from cudf.utils.dtypes import SIZE_TYPE_DTYPE +from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, + is_column_like, + is_dtype_obj_numeric, +) from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import _warn_no_dask_cudf @@ -71,6 +74,7 @@ ColumnLike, DataFrameOrSeries, Dtype, + DtypeObj, NotImplementedType, ) @@ -6406,9 +6410,9 @@ def rank( dropped_cols = False source = self if numeric_only: - if isinstance( - source, cudf.Series - ) and not _is_non_decimal_numeric_dtype(self.dtype): # type: ignore[attr-defined] + if isinstance(source, cudf.Series) and not is_dtype_obj_numeric( + source.dtype, include_decimal=False + ): # type: ignore[attr-defined] raise TypeError( "Series.rank does not allow numeric_only=True with " "non-numeric dtype." 
@@ -6416,7 +6420,7 @@ def rank( numeric_cols = ( name for name, dtype in self._dtypes - if _is_non_decimal_numeric_dtype(dtype) + if is_dtype_obj_numeric(dtype, include_decimal=False) ) source = self._get_columns_by_label(numeric_cols) if source.empty: @@ -6558,7 +6562,7 @@ def _check_duplicate_level_names(specified, level_names): @_performance_tracking def _get_replacement_values_for_columns( - to_replace: Any, value: Any, columns_dtype_map: dict[Any, Any] + to_replace: Any, value: Any, columns_dtype_map: dict[Any, DtypeObj] ) -> tuple[dict[Any, bool], dict[Any, Any], dict[Any, Any]]: """ Returns a per column mapping for the values to be replaced, new @@ -6591,24 +6595,22 @@ def _get_replacement_values_for_columns( if is_scalar(to_replace) and is_scalar(value): to_replace_columns = {col: [to_replace] for col in columns_dtype_map} values_columns = {col: [value] for col in columns_dtype_map} - elif cudf.api.types.is_list_like(to_replace) or isinstance( + elif is_list_like(to_replace) or isinstance( to_replace, (ColumnBase, BaseIndex) ): if is_scalar(value): to_replace_columns = {col: to_replace for col in columns_dtype_map} values_columns = { col: [value] - if _is_non_decimal_numeric_dtype(columns_dtype_map[col]) + if is_dtype_obj_numeric(dtype, include_decimal=False) else as_column( value, length=len(to_replace), dtype=cudf.dtype(type(value)), ) - for col in columns_dtype_map + for col, dtype in columns_dtype_map.items() } - elif cudf.api.types.is_list_like( - value - ) or cudf.utils.dtypes.is_column_like(value): + elif is_list_like(value) or is_column_like(value): if len(to_replace) != len(value): raise ValueError( f"Replacement lists must be " diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index c329bf11d97..331aa57fca4 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -9,9 +9,15 @@ import numpy as np import cudf -from cudf.api.types import 
is_decimal_dtype, is_dtype_equal, is_numeric_dtype +from cudf.api.types import is_dtype_equal from cudf.core.column import CategoricalColumn -from cudf.core.dtypes import CategoricalDtype +from cudf.core.dtypes import ( + CategoricalDtype, + Decimal32Dtype, + Decimal64Dtype, + Decimal128Dtype, +) +from cudf.utils.dtypes import is_dtype_obj_numeric if TYPE_CHECKING: from cudf.core.column import ColumnBase @@ -81,15 +87,17 @@ def _match_join_keys( if is_dtype_equal(ltype, rtype): return lcol, rcol - if is_decimal_dtype(ltype) or is_decimal_dtype(rtype): + if isinstance( + ltype, (Decimal32Dtype, Decimal64Dtype, Decimal128Dtype) + ) or isinstance(rtype, (Decimal32Dtype, Decimal64Dtype, Decimal128Dtype)): raise TypeError( "Decimal columns can only be merged with decimal columns " "of the same precision and scale" ) if ( - is_numeric_dtype(ltype) - and is_numeric_dtype(rtype) + is_dtype_obj_numeric(ltype) + and is_dtype_obj_numeric(rtype) and not (ltype.kind == "m" or rtype.kind == "m") ): common_type = ( diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 87a8849a260..f681c043186 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -17,7 +17,7 @@ import cudf from cudf.api.extensions import no_default -from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar +from cudf.api.types import is_integer, is_list_like, is_scalar from cudf.core import column from cudf.core._base_index import _return_get_indexer_result from cudf.core._internals import sorting @@ -33,7 +33,11 @@ ensure_index, ) from cudf.core.join._join_helpers import _match_join_keys -from cudf.utils.dtypes import SIZE_TYPE_DTYPE, is_column_like +from cudf.utils.dtypes import ( + CUDF_STRING_DTYPE, + SIZE_TYPE_DTYPE, + is_column_like, +) from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name @@ -42,7 +46,7 @@ from 
typing_extensions import Self - from cudf._typing import DataFrameOrSeries + from cudf._typing import DataFrameOrSeries, Dtype def _maybe_indices_to_slice(indices: cp.ndarray) -> slice | cp.ndarray: @@ -233,8 +237,8 @@ def to_series(self, index=None, name=None): ) @_performance_tracking - def astype(self, dtype, copy: bool = True) -> Self: - if not is_object_dtype(dtype): + def astype(self, dtype: Dtype, copy: bool = True) -> Self: + if cudf.dtype(dtype) != CUDF_STRING_DTYPE: raise TypeError( "Setting a MultiIndex dtype to anything other than object is " "not supported" @@ -1699,16 +1703,12 @@ def _is_sorted(self, ascending=None, null_position=None) -> bool: Returns True, if sorted as expected by ``ascending`` and ``null_position``, False otherwise. """ - if ascending is not None and not cudf.api.types.is_list_like( - ascending - ): + if ascending is not None and not is_list_like(ascending): raise TypeError( f"Expected a list-like or None for `ascending`, got " f"{type(ascending)}" ) - if null_position is not None and not cudf.api.types.is_list_like( - null_position - ): + if null_position is not None and not is_list_like(null_position): raise TypeError( f"Expected a list-like or None for `null_position`, got " f"{type(null_position)}" diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 7d76907916f..b7412f2cc85 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -12,7 +12,7 @@ import cudf from cudf.api.extensions import no_default -from cudf.api.types import is_scalar +from cudf.api.types import is_list_like, is_scalar from cudf.core._compat import PANDAS_LT_300 from cudf.core.column import ( ColumnBase, @@ -1362,7 +1362,7 @@ def _one_hot_encode_column( def _length_check_params(obj, columns, name): - if cudf.api.types.is_list_like(obj): + if is_list_like(obj): if len(obj) != len(columns): raise ValueError( f"Length of '{name}' ({len(obj)}) did not match the " diff --git 
a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 29139768a36..8579b7398f0 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -9,7 +9,6 @@ from typing import TYPE_CHECKING, Any import numpy as np -import pandas as pd import pyarrow as pa import pylibcudf as plc @@ -25,6 +24,7 @@ from cudf.core.missing import NA, NaT from cudf.core.mixins import BinaryOperand from cudf.utils.dtypes import ( + CUDF_STRING_DTYPE, cudf_dtype_from_pa_type, get_allowed_combinations_for_operator, to_cudf_compatible_scalar, @@ -191,7 +191,7 @@ def _to_plc_scalar(value: ScalarLike, dtype: Dtype) -> plc.Scalar: if isinstance(dtype, cudf.core.dtypes._BaseDtype): pa_type = dtype.to_arrow() - elif pd.api.types.is_string_dtype(dtype): + elif dtype == CUDF_STRING_DTYPE: # Have to manually convert object types, which we use internally # for strings but pyarrow only supports as unicode 'U' pa_type = pa.string() diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index f6f1b31dc43..7c39db7057d 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -20,7 +20,6 @@ import cudf from cudf.api.extensions import no_default from cudf.api.types import ( - _is_non_decimal_numeric_dtype, _is_scalar_or_zero_d_array, is_dict_like, is_integer, @@ -64,6 +63,7 @@ from cudf.utils.dtypes import ( can_convert_to_column, find_common_type, + is_dtype_obj_numeric, is_mixed_with_object_dtype, to_cudf_compatible_scalar, ) @@ -357,7 +357,9 @@ def _loc_to_iloc(self, arg): "as labels (consistent with DataFrame behavior). 
To access " "a value by position, use `ser.iloc[pos]`" ) - if not _is_non_decimal_numeric_dtype(index_dtype) and not ( + if not is_dtype_obj_numeric( + index_dtype, include_decimal=False + ) and not ( isinstance(index_dtype, cudf.CategoricalDtype) and index_dtype.categories.dtype.kind in "iu" ): diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index f9713ca62d1..aa59d3af640 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -12,12 +12,12 @@ from cudf.api.types import ( _is_scalar_or_zero_d_array, is_integer, - is_numeric_dtype, + is_scalar, ) from cudf.core.column import ColumnBase, as_column from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame -from cudf.utils.dtypes import SIZE_TYPE_DTYPE +from cudf.utils.dtypes import SIZE_TYPE_DTYPE, is_dtype_obj_numeric from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import NotIterable @@ -54,7 +54,7 @@ def _reduce( if axis not in (None, 0, no_default): raise NotImplementedError("axis parameter is not implemented yet") - if numeric_only and not is_numeric_dtype(self.dtype): + if numeric_only and not is_dtype_obj_numeric(self.dtype): raise TypeError( f"Series.{op} does not allow numeric_only={numeric_only} " "with non-numeric dtypes." 
@@ -374,7 +374,7 @@ def where(self, cond, other=None, inplace=False): """Array conditional must be same shape as self""" ) - if not cudf.api.types.is_scalar(other): + if not is_scalar(other): other = cudf.core.column.as_column(other) input_col, other = _check_and_cast_columns_with_other( diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 4478be2fd04..89abc120de9 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -882,7 +882,7 @@ def date_range( "three must be specified" ) - if periods is not None and not cudf.api.types.is_integer(periods): + if periods is not None and not is_integer(periods): warnings.warn( "Non-integer 'periods' in cudf.date_range, and cudf.interval_range" " are deprecated and will raise in a future version.", diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 9746234cfb1..18e96ee4a68 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -8,11 +8,14 @@ import pandas as pd import cudf -from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype from cudf.core.column import as_column from cudf.core.dtypes import CategoricalDtype from cudf.core.index import ensure_index -from cudf.utils.dtypes import can_convert_to_column +from cudf.utils.dtypes import ( + CUDF_STRING_DTYPE, + can_convert_to_column, + is_dtype_obj_numeric, +) if TYPE_CHECKING: from cudf.core.column.numerical import NumericalColumn @@ -142,7 +145,7 @@ def to_numeric( return arg else: raise e - elif is_string_dtype(dtype): + elif dtype == CUDF_STRING_DTYPE: try: col = _convert_str_col(col, errors, downcast) # type: ignore[arg-type] except ValueError as e: @@ -152,7 +155,7 @@ def to_numeric( raise e elif isinstance(dtype, (cudf.ListDtype, cudf.StructDtype)): raise ValueError("Input does not support nested datatypes") - elif _is_non_decimal_numeric_dtype(dtype): + elif 
is_dtype_obj_numeric(dtype, include_decimal=False): pass else: raise ValueError("Unrecognized datatype") @@ -218,7 +221,7 @@ def _convert_str_col( ------- Converted numeric column """ - if not is_string_dtype(col): + if col.dtype != CUDF_STRING_DTYPE: raise TypeError("col must be string dtype.") if col.is_integer().all(): diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py index 3e8a6ab400c..4b94e3e52b1 100644 --- a/python/cudf/cudf/core/window/ewm.py +++ b/python/cudf/cudf/core/window/ewm.py @@ -6,8 +6,8 @@ import numpy as np -from cudf.api.types import is_numeric_dtype from cudf.core.window.rolling import _RollingBase +from cudf.utils.dtypes import is_dtype_obj_numeric if TYPE_CHECKING: from cudf.core.column.column import ColumnBase @@ -184,7 +184,7 @@ def cov( def _apply_agg_column( self, source_column: ColumnBase, agg_name: str ) -> ColumnBase: - if not is_numeric_dtype(source_column.dtype): + if not is_dtype_obj_numeric(source_column.dtype): raise TypeError("No numeric types to aggregate") # libcudf ewm has special casing for nulls only diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index 3b3fd5f7c56..e7b224a40e7 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
from __future__ import annotations import pylibcudf as plc @@ -6,6 +6,7 @@ import cudf from cudf.core.column import ColumnBase from cudf.utils import ioutils +from cudf.utils.dtypes import find_common_type, is_dtype_obj_numeric def from_dlpack(pycapsule_obj) -> cudf.Series | cudf.DataFrame: @@ -83,12 +84,12 @@ def to_dlpack(cudf_obj: cudf.Series | cudf.DataFrame | cudf.BaseIndex): ) if any( - not cudf.api.types._is_non_decimal_numeric_dtype(dtype) + not is_dtype_obj_numeric(dtype, include_decimal=False) for _, dtype in gdf._dtypes # type: ignore[union-attr] ): raise TypeError("non-numeric data not yet supported") - dtype = cudf.utils.dtypes.find_common_type( + dtype = find_common_type( [dtype for _, dtype in gdf._dtypes] # type: ignore[union-attr] ) gdf = gdf.astype(dtype) diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 9c20a42d215..e1b0c17eb00 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -10,15 +10,15 @@ from pandas import testing as tm import cudf -from cudf.api.types import is_numeric_dtype, is_string_dtype from cudf.core.missing import NA, NaT +from cudf.utils.dtypes import CUDF_STRING_DTYPE, is_dtype_obj_numeric def dtype_can_compare_equal_to_other(dtype): # return True if values of this dtype can compare # as equal to equal values of a different dtype return not ( - is_string_dtype(dtype) + dtype == CUDF_STRING_DTYPE or isinstance( dtype, ( @@ -218,10 +218,10 @@ def assert_column_equal( elif not ( ( not dtype_can_compare_equal_to_other(left.dtype) - and is_numeric_dtype(right.dtype) + and is_dtype_obj_numeric(right.dtype) ) or ( - is_numeric_dtype(left.dtype) + is_dtype_obj_numeric(left.dtype) and not dtype_can_compare_equal_to_other(right.dtype) ) ): @@ -234,7 +234,7 @@ def assert_column_equal( if ( columns_equal and not check_exact - and is_numeric_dtype(left.dtype) + and is_dtype_obj_numeric(left.dtype) ): # non-null values must be the same columns_equal = 
cp.allclose( diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 489b804583a..adee17e7bfb 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -612,6 +612,20 @@ def _get_base_dtype(dtype: pd.DatetimeTZDtype) -> np.dtype: return dtype.base +def is_dtype_obj_numeric( + dtype: DtypeObj, include_decimal: bool = True +) -> bool: + """Like is_numeric_dtype but does not introspect argument.""" + is_non_decimal = dtype.kind in set("iufb") + if include_decimal: + return is_non_decimal or isinstance( + dtype, + (cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype), + ) + else: + return is_non_decimal + + def dtype_to_pylibcudf_type(dtype) -> plc.DataType: if isinstance(dtype, cudf.ListDtype): return plc.DataType(plc.TypeId.LIST) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 2678a4f8116..601a7a369e8 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -15,7 +15,6 @@ import rmm import cudf -import cudf.api.types from cudf.core import column from cudf.core.buffer import as_buffer from cudf.utils.dtypes import SIZE_TYPE_DTYPE