From 9e4abcf874e79c0364800fb82cb4e238038eb151 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 28 Feb 2025 17:05:41 -0800 Subject: [PATCH] Prune some seldom used dtype utils --- python/cudf/cudf/core/column/timedelta.py | 23 +----- python/cudf/cudf/core/dataframe.py | 18 ++++- python/cudf/cudf/testing/dataset_generator.py | 10 +-- python/cudf/cudf/tests/test_dtypes.py | 22 ++---- python/cudf/cudf/utils/dtypes.py | 78 ++++--------------- python/cudf/cudf/utils/ioutils.py | 39 +++------- 6 files changed, 51 insertions(+), 139 deletions(-) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index e4d47f492c2..22c288ce9f7 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -9,7 +9,6 @@ import numpy as np import pandas as pd -import pyarrow as pa import pylibcudf as plc @@ -19,7 +18,7 @@ from cudf.core._internals import binaryop from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column.column import ColumnBase -from cudf.utils.dtypes import CUDF_STRING_DTYPE, np_to_pa_dtype +from cudf.utils.dtypes import CUDF_STRING_DTYPE from cudf.utils.utils import ( _all_bools_with_nulls, _datetime_timedelta_find_and_replace, @@ -179,26 +178,6 @@ def to_pandas( pa_array.to_numpy(zero_copy_only=False, writable=True) ) - @acquire_spill_lock() - def to_arrow(self) -> pa.Array: - mask = None - if self.nullable: - mask = pa.py_buffer( - self.mask_array_view(mode="read").copy_to_host() - ) - data = pa.py_buffer( - self.astype(np.dtype(np.int64)) - .data_array_view(mode="read") - .copy_to_host() - ) - pa_dtype = np_to_pa_dtype(self.dtype) - return pa.Array.from_buffers( - type=pa_dtype, - length=len(self), - buffers=[mask, data], - null_count=self.null_count, - ) - def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: reflect, op = self._check_reflected_op(op) other = self._wrap_binop_normalization(other) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 69db055fe87..dfe2b381d92 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -87,8 +87,8 @@ from cudf.utils.dtypes import ( CUDF_STRING_DTYPE, SIZE_TYPE_DTYPE, + SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES, can_convert_to_column, - cudf_dtype_from_pydata_dtype, find_common_type, is_column_like, min_signed_type, @@ -6828,6 +6828,22 @@ def select_dtypes(self, include=None, exclude=None): if not isinstance(exclude, (list, tuple)): exclude = (exclude,) if exclude is not None else () + def cudf_dtype_from_pydata_dtype(dtype): + """Given a numpy or pandas dtype, converts it into the equivalent cuDF + Python dtype. + """ + if cudf.api.types._is_categorical_dtype(dtype): + return cudf.core.dtypes.CategoricalDtype + elif cudf.api.types.is_decimal32_dtype(dtype): + return cudf.core.dtypes.Decimal32Dtype + elif cudf.api.types.is_decimal64_dtype(dtype): + return cudf.core.dtypes.Decimal64Dtype + elif cudf.api.types.is_decimal128_dtype(dtype): + return cudf.core.dtypes.Decimal128Dtype + elif dtype in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: + return dtype.type + return pd.core.dtypes.common.infer_dtype_from_object(dtype) + # cudf_dtype_from_pydata_dtype can distinguish between # np.float and np.number selection = tuple(map(frozenset, (include, exclude))) diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index 01a75a2efb0..86244493d4f 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. # This module is for generating "synthetic" datasets. It was originally # designed for testing filtered reading. Generally, it should be useful @@ -17,7 +17,7 @@ from pyarrow import parquet as pq import cudf -from cudf.utils.dtypes import np_to_pa_dtype +from cudf.utils.dtypes import cudf_dtype_to_pa_type class ColumnParameters: @@ -137,7 +137,7 @@ def _generate_column(column_params, num_rows, rng): if hasattr(column_params.dtype, "to_arrow"): arrow_type = column_params.dtype.to_arrow() elif column_params.dtype is not None: - arrow_type = np_to_pa_dtype(cudf.dtype(column_params.dtype)) + arrow_type = cudf_dtype_to_pa_type(cudf.dtype(column_params.dtype)) else: arrow_type = None @@ -254,14 +254,14 @@ def get_dataframe(parameters, use_threads): ): arrow_type = pa.dictionary( index_type=pa.int64(), - value_type=np_to_pa_dtype( + value_type=cudf_dtype_to_pa_type( cudf.dtype(type(next(iter(column_params.generator)))) ), ) elif hasattr(column_params.dtype, "to_arrow"): arrow_type = column_params.dtype.to_arrow() else: - arrow_type = np_to_pa_dtype( + arrow_type = cudf_dtype_to_pa_type( cudf.dtype(type(next(iter(column_params.generator)))) if column_params.dtype is None else column_params.dtype diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index c62b5889fdd..103a286b892 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import numpy as np import pandas as pd @@ -18,7 +18,7 @@ StructDtype, ) from cudf.testing import assert_eq -from cudf.utils.dtypes import np_to_pa_dtype +from cudf.utils.dtypes import cudf_dtype_to_pa_type def test_cdt_basic(): @@ -67,7 +67,7 @@ def test_cdf_to_pandas(data, ordered): ], ) def test_list_dtype_pyarrow_round_trip(value_type): - pa_type = pa.list_(cudf.utils.dtypes.np_to_pa_dtype(np.dtype(value_type))) + pa_type = pa.list_(cudf_dtype_to_pa_type(cudf.dtype(value_type))) expect = pa_type got = ListDtype.from_arrow(expect).to_arrow() assert expect.equals(got) @@ -99,10 +99,7 @@ def test_list_nested_dtype(): ) def test_struct_dtype_pyarrow_round_trip(fields): pa_type = pa.struct( - { - k: cudf.utils.dtypes.np_to_pa_dtype(np.dtype(v)) - for k, v in fields.items() - } + {k: pa.from_numpy_dtype(np.dtype(v)) for k, v in fields.items()} ) expect = pa_type got = StructDtype.from_arrow(expect).to_arrow() @@ -215,7 +212,7 @@ def assert_column_array_dtype_equal(column: ColumnBase, array: pa.array): elif isinstance(column.dtype, CategoricalDtype): raise NotImplementedError() else: - return array.type.equals(np_to_pa_dtype(column.dtype)) + return array.type.equals(cudf_dtype_to_pa_type(column.dtype)) @pytest.mark.parametrize( @@ -353,12 +350,3 @@ def test_dtype(in_dtype, expect): def test_dtype_raise(in_dtype): with pytest.raises(TypeError): cudf.dtype(in_dtype) - - -def test_dtype_np_bool_to_pa_bool(): - """This test case captures that utility np_to_pa_dtype - should map np.bool_ to pa.bool_, nuances on bit width - difference should be handled elsewhere. - """ - - assert np_to_pa_dtype(np.dtype("bool")) == pa.bool_() diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 489b804583a..e59b8cb02b4 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -9,36 +9,15 @@ import numpy as np import pandas as pd import pyarrow as pa -from pandas.core.dtypes.common import infer_dtype_from_object import pylibcudf as plc import cudf if TYPE_CHECKING: - from cudf._typing import DtypeObj + from collections.abc import Iterable -"""Map numpy dtype to pyarrow types. -Note that np.bool_ bitwidth (8) is different from pa.bool_ (1). Special -handling is required when converting a Boolean column into arrow. -""" -_np_pa_dtypes = { - np.float64: pa.float64(), - np.float32: pa.float32(), - np.int64: pa.int64(), - np.longlong: pa.int64(), - np.int32: pa.int32(), - np.int16: pa.int16(), - np.int8: pa.int8(), - np.bool_: pa.bool_(), - np.uint64: pa.uint64(), - np.uint32: pa.uint32(), - np.uint16: pa.uint16(), - np.uint8: pa.uint8(), - np.datetime64: pa.date64(), - np.object_: pa.string(), - np.str_: pa.string(), -} + from cudf._typing import DtypeObj np_dtypes_to_pandas_dtypes = { np.dtype("uint8"): pd.UInt8Dtype(), @@ -97,27 +76,11 @@ ALL_TYPES = NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | OTHER_TYPES -def np_to_pa_dtype(dtype: np.dtype) -> pa.DataType: - """Util to convert numpy dtype to PyArrow dtype.""" - # special case when dtype is np.datetime64 - if dtype.kind == "M": - time_unit, _ = np.datetime_data(dtype) - if time_unit in ("s", "ms", "us", "ns"): - # return a pa.Timestamp of the appropriate unit - return pa.timestamp(time_unit) - # default is int64_t UNIX ms - return pa.date64() - elif dtype.kind == "m": - time_unit, _ = np.datetime_data(dtype) - if time_unit in ("s", "ms", "us", "ns"): - # return a pa.Duration of the appropriate unit - return pa.duration(time_unit) - # default fallback unit is ns - return pa.duration("ns") - return _np_pa_dtypes[dtype.type] - - -def _find_common_type_decimal(dtypes): +def _find_common_type_decimal( + dtypes: Iterable[ + cudf.Decimal128Dtype | cudf.Decimal64Dtype | cudf.Decimal32Dtype + ], +) -> cudf.Decimal128Dtype | cudf.Decimal64Dtype | cudf.Decimal32Dtype: # Find the largest scale and the largest difference between # precision and scale of the columns to be concatenated s = max(dtype.scale for dtype in dtypes) @@ -140,25 +103,6 @@ def _find_common_type_decimal(dtypes): ) -def cudf_dtype_from_pydata_dtype(dtype): - """Given a numpy or pandas dtype, converts it into the equivalent cuDF - Python dtype. - """ - - if cudf.api.types._is_categorical_dtype(dtype): - return cudf.core.dtypes.CategoricalDtype - elif cudf.api.types.is_decimal32_dtype(dtype): - return cudf.core.dtypes.Decimal32Dtype - elif cudf.api.types.is_decimal64_dtype(dtype): - return cudf.core.dtypes.Decimal64Dtype - elif cudf.api.types.is_decimal128_dtype(dtype): - return cudf.core.dtypes.Decimal128Dtype - elif dtype in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: - return dtype.type - - return infer_dtype_from_object(dtype) - - def cudf_dtype_to_pa_type(dtype: DtypeObj) -> pa.DataType: """Given a cudf pandas dtype, converts it into the equivalent cuDF Python dtype. @@ -172,8 +116,12 @@ def cudf_dtype_to_pa_type(dtype: DtypeObj) -> pa.DataType: (cudf.StructDtype, cudf.ListDtype, cudf.core.dtypes.DecimalDtype), ): return dtype.to_arrow() + elif isinstance(dtype, pd.DatetimeTZDtype): + return pa.timestamp(dtype.unit, str(dtype.tz)) + elif dtype == CUDF_STRING_DTYPE: + return pa.string() else: - return np_to_pa_dtype(dtype) + return pa.from_numpy_dtype(dtype) def cudf_dtype_from_pa_type(typ: pa.DataType) -> DtypeObj: @@ -186,7 +134,7 @@ def cudf_dtype_from_pa_type(typ: pa.DataType) -> DtypeObj: return cudf.core.dtypes.StructDtype.from_arrow(typ) elif pa.types.is_decimal(typ): return cudf.core.dtypes.Decimal128Dtype.from_arrow(typ) - elif pa.types.is_large_string(typ): + elif pa.types.is_large_string(typ) or pa.types.is_string(typ): return CUDF_STRING_DTYPE else: return cudf.api.types.pandas_dtype(typ.to_pandas_dtype()) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 9fb06faa66c..812bb48c4f5 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -23,7 +23,7 @@ from cudf.api.types import is_list_like from cudf.core._compat import PANDAS_LT_300 from cudf.utils.docutils import docfmt_partial -from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes, np_to_pa_dtype +from cudf.utils.dtypes import cudf_dtype_to_pa_type, np_dtypes_to_pandas_dtypes try: import fsspec.parquet as fsspec_parquet @@ -1546,25 +1546,14 @@ def generate_pandas_metadata(table: cudf.DataFrame, index: bool | None) -> str: else: col_names.append(name) - if isinstance(col.dtype, cudf.CategoricalDtype): - raise ValueError( - "'category' column dtypes are currently not " - + "supported by the gpu accelerated parquet writer" - ) - elif isinstance( - col.dtype, - (cudf.ListDtype, cudf.StructDtype, cudf.core.dtypes.DecimalDtype), - ): - types.append(col.dtype.to_arrow()) - else: + if col.dtype.kind == "b": # A boolean element takes 8 bits in cudf and 1 bit in # pyarrow. To make sure the cudf format is interoperable # with arrow, we use `int8` type when converting from a # cudf boolean array. - if col.dtype.type == np.bool_: - types.append(pa.int8()) - else: - types.append(np_to_pa_dtype(col.dtype)) + types.append(pa.int8()) + else: + types.append(cudf_dtype_to_pa_type(col.dtype)) # Indexes materialize_index = False @@ -1596,29 +1585,21 @@ def generate_pandas_metadata(table: cudf.DataFrame, index: bool | None) -> str: index_levels.append(materialized_idx) columns_to_convert.append(materialized_idx._values) col_names.append(descr) - types.append(np_to_pa_dtype(materialized_idx.dtype)) + types.append(pa.from_numpy_dtype(materialized_idx.dtype)) else: descr = _index_level_name( index_name=idx.name, level=level, column_names=col_names ) columns_to_convert.append(idx._values) col_names.append(descr) - if isinstance(idx.dtype, cudf.CategoricalDtype): - raise ValueError( - "'category' column dtypes are currently not " - + "supported by the gpu accelerated parquet writer" - ) - elif isinstance(idx.dtype, cudf.ListDtype): - types.append(col.dtype.to_arrow()) - else: + if idx.dtype.kind == "b": # A boolean element takes 8 bits in cudf and 1 bit in # pyarrow. To make sure the cudf format is interperable # in arrow, we use `int8` type when converting from a # cudf boolean array. - if idx.dtype.type == np.bool_: - types.append(pa.int8()) - else: - types.append(np_to_pa_dtype(idx.dtype)) + types.append(pa.int8()) + else: + types.append(cudf_dtype_to_pa_type(idx.dtype)) index_levels.append(idx) index_descriptors.append(descr)