Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prune some seldom used dtype utils #18141

Open
wants to merge 1 commit into
base: branch-25.04
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 1 addition & 22 deletions python/cudf/cudf/core/column/timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@

import numpy as np
import pandas as pd
import pyarrow as pa

import pylibcudf as plc

Expand All @@ -19,7 +18,7 @@
from cudf.core._internals import binaryop
from cudf.core.buffer import Buffer, acquire_spill_lock
from cudf.core.column.column import ColumnBase
from cudf.utils.dtypes import CUDF_STRING_DTYPE, np_to_pa_dtype
from cudf.utils.dtypes import CUDF_STRING_DTYPE
from cudf.utils.utils import (
_all_bools_with_nulls,
_datetime_timedelta_find_and_replace,
Expand Down Expand Up @@ -179,26 +178,6 @@ def to_pandas(
pa_array.to_numpy(zero_copy_only=False, writable=True)
)

    @acquire_spill_lock()
    def to_arrow(self) -> pa.Array:
        """Convert this timedelta column into a ``pyarrow.Array``.

        Copies the device buffers to host and hands them to arrow as-is,
        reconstructing the array via ``pa.Array.from_buffers``.
        """
        mask = None
        if self.nullable:
            # Host copy of the validity bitmask; arrow primitive arrays
            # take the validity buffer first in `buffers` below.
            mask = pa.py_buffer(
                self.mask_array_view(mode="read").copy_to_host()
            )
        # Cast the timedelta payload to int64 before copying to host; the
        # arrow duration type derived below restores the time semantics.
        data = pa.py_buffer(
            self.astype(np.dtype(np.int64))
            .data_array_view(mode="read")
            .copy_to_host()
        )
        pa_dtype = np_to_pa_dtype(self.dtype)
        return pa.Array.from_buffers(
            type=pa_dtype,
            length=len(self),
            buffers=[mask, data],
            null_count=self.null_count,
        )

def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
reflect, op = self._check_reflected_op(op)
other = self._wrap_binop_normalization(other)
Expand Down
18 changes: 17 additions & 1 deletion python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,8 @@
from cudf.utils.dtypes import (
CUDF_STRING_DTYPE,
SIZE_TYPE_DTYPE,
SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES,
can_convert_to_column,
cudf_dtype_from_pydata_dtype,
find_common_type,
is_column_like,
min_signed_type,
Expand Down Expand Up @@ -6828,6 +6828,22 @@ def select_dtypes(self, include=None, exclude=None):
if not isinstance(exclude, (list, tuple)):
exclude = (exclude,) if exclude is not None else ()

def cudf_dtype_from_pydata_dtype(dtype):
"""Given a numpy or pandas dtype, converts it into the equivalent cuDF
Python dtype.
"""
if cudf.api.types._is_categorical_dtype(dtype):
return cudf.core.dtypes.CategoricalDtype
elif cudf.api.types.is_decimal32_dtype(dtype):
return cudf.core.dtypes.Decimal32Dtype
elif cudf.api.types.is_decimal64_dtype(dtype):
return cudf.core.dtypes.Decimal64Dtype
elif cudf.api.types.is_decimal128_dtype(dtype):
return cudf.core.dtypes.Decimal128Dtype
elif dtype in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES:
return dtype.type
return pd.core.dtypes.common.infer_dtype_from_object(dtype)

# cudf_dtype_from_pydata_dtype can distinguish between
# np.float and np.number
selection = tuple(map(frozenset, (include, exclude)))
Expand Down
10 changes: 5 additions & 5 deletions python/cudf/cudf/testing/dataset_generator.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
# Copyright (c) 2020-2025, NVIDIA CORPORATION.

# This module is for generating "synthetic" datasets. It was originally
# designed for testing filtered reading. Generally, it should be useful
Expand All @@ -17,7 +17,7 @@
from pyarrow import parquet as pq

import cudf
from cudf.utils.dtypes import np_to_pa_dtype
from cudf.utils.dtypes import cudf_dtype_to_pa_type


class ColumnParameters:
Expand Down Expand Up @@ -137,7 +137,7 @@ def _generate_column(column_params, num_rows, rng):
if hasattr(column_params.dtype, "to_arrow"):
arrow_type = column_params.dtype.to_arrow()
elif column_params.dtype is not None:
arrow_type = np_to_pa_dtype(cudf.dtype(column_params.dtype))
arrow_type = cudf_dtype_to_pa_type(cudf.dtype(column_params.dtype))
else:
arrow_type = None

Expand Down Expand Up @@ -254,14 +254,14 @@ def get_dataframe(parameters, use_threads):
):
arrow_type = pa.dictionary(
index_type=pa.int64(),
value_type=np_to_pa_dtype(
value_type=cudf_dtype_to_pa_type(
cudf.dtype(type(next(iter(column_params.generator))))
),
)
elif hasattr(column_params.dtype, "to_arrow"):
arrow_type = column_params.dtype.to_arrow()
else:
arrow_type = np_to_pa_dtype(
arrow_type = cudf_dtype_to_pa_type(
cudf.dtype(type(next(iter(column_params.generator))))
if column_params.dtype is None
else column_params.dtype
Expand Down
22 changes: 5 additions & 17 deletions python/cudf/cudf/tests/test_dtypes.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
# Copyright (c) 2020-2025, NVIDIA CORPORATION.

import numpy as np
import pandas as pd
Expand All @@ -18,7 +18,7 @@
StructDtype,
)
from cudf.testing import assert_eq
from cudf.utils.dtypes import np_to_pa_dtype
from cudf.utils.dtypes import cudf_dtype_to_pa_type


def test_cdt_basic():
Expand Down Expand Up @@ -67,7 +67,7 @@ def test_cdf_to_pandas(data, ordered):
],
)
def test_list_dtype_pyarrow_round_trip(value_type):
pa_type = pa.list_(cudf.utils.dtypes.np_to_pa_dtype(np.dtype(value_type)))
pa_type = pa.list_(cudf_dtype_to_pa_type(cudf.dtype(value_type)))
expect = pa_type
got = ListDtype.from_arrow(expect).to_arrow()
assert expect.equals(got)
Expand Down Expand Up @@ -99,10 +99,7 @@ def test_list_nested_dtype():
)
def test_struct_dtype_pyarrow_round_trip(fields):
pa_type = pa.struct(
{
k: cudf.utils.dtypes.np_to_pa_dtype(np.dtype(v))
for k, v in fields.items()
}
{k: pa.from_numpy_dtype(np.dtype(v)) for k, v in fields.items()}
)
expect = pa_type
got = StructDtype.from_arrow(expect).to_arrow()
Expand Down Expand Up @@ -215,7 +212,7 @@ def assert_column_array_dtype_equal(column: ColumnBase, array: pa.array):
elif isinstance(column.dtype, CategoricalDtype):
raise NotImplementedError()
else:
return array.type.equals(np_to_pa_dtype(column.dtype))
return array.type.equals(cudf_dtype_to_pa_type(column.dtype))


@pytest.mark.parametrize(
Expand Down Expand Up @@ -353,12 +350,3 @@ def test_dtype(in_dtype, expect):
def test_dtype_raise(in_dtype):
with pytest.raises(TypeError):
cudf.dtype(in_dtype)


def test_dtype_np_bool_to_pa_bool():
    """``np_to_pa_dtype`` must map ``np.bool_`` to ``pa.bool_``.

    The bit-width mismatch (numpy bools are 8 bits, arrow bools 1 bit)
    is handled elsewhere, not by this utility.
    """
    got = np_to_pa_dtype(np.dtype("bool"))
    assert got == pa.bool_()
78 changes: 13 additions & 65 deletions python/cudf/cudf/utils/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,36 +9,15 @@
import numpy as np
import pandas as pd
import pyarrow as pa
from pandas.core.dtypes.common import infer_dtype_from_object

import pylibcudf as plc

import cudf

if TYPE_CHECKING:
from cudf._typing import DtypeObj
from collections.abc import Iterable

"""Map numpy dtype to pyarrow types.
Note that np.bool_ bitwidth (8) is different from pa.bool_ (1). Special
handling is required when converting a Boolean column into arrow.
"""
_np_pa_dtypes = {
    np.float64: pa.float64(),
    np.float32: pa.float32(),
    np.int64: pa.int64(),
    np.longlong: pa.int64(),  # platform alias of int64
    np.int32: pa.int32(),
    np.int16: pa.int16(),
    np.int8: pa.int8(),
    np.bool_: pa.bool_(),
    np.uint64: pa.uint64(),
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),  # fallback only; unit-aware mapping lives in np_to_pa_dtype
    np.object_: pa.string(),
    np.str_: pa.string(),
}
from cudf._typing import DtypeObj

np_dtypes_to_pandas_dtypes = {
np.dtype("uint8"): pd.UInt8Dtype(),
Expand Down Expand Up @@ -97,27 +76,11 @@
ALL_TYPES = NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | OTHER_TYPES


def np_to_pa_dtype(dtype: np.dtype) -> pa.DataType:
    """Util to convert numpy dtype to PyArrow dtype."""
    kind = dtype.kind
    if kind in "Mm":
        # Both datetime64 ("M") and timedelta64 ("m") carry a time unit.
        unit, _ = np.datetime_data(dtype)
        known_unit = unit in ("s", "ms", "us", "ns")
        if kind == "M":
            # Recognized resolutions become arrow timestamps; otherwise
            # fall back to int64-based UNIX-ms date64.
            return pa.timestamp(unit) if known_unit else pa.date64()
        # Timedeltas: default to nanosecond duration for odd units.
        return pa.duration(unit) if known_unit else pa.duration("ns")
    # Everything else is a direct table lookup on the scalar type.
    return _np_pa_dtypes[dtype.type]


def _find_common_type_decimal(dtypes):
def _find_common_type_decimal(
dtypes: Iterable[
cudf.Decimal128Dtype | cudf.Decimal64Dtype | cudf.Decimal32Dtype
],
) -> cudf.Decimal128Dtype | cudf.Decimal64Dtype | cudf.Decimal32Dtype:
# Find the largest scale and the largest difference between
# precision and scale of the columns to be concatenated
s = max(dtype.scale for dtype in dtypes)
Expand All @@ -140,25 +103,6 @@ def _find_common_type_decimal(dtypes):
)


def cudf_dtype_from_pydata_dtype(dtype):
    """Given a numpy or pandas dtype, converts it into the equivalent cuDF
    Python dtype.
    """

    # cuDF-specific dtypes (categorical, decimals) must be recognized
    # before the generic numpy lookup below.
    if cudf.api.types._is_categorical_dtype(dtype):
        return cudf.core.dtypes.CategoricalDtype
    elif cudf.api.types.is_decimal32_dtype(dtype):
        return cudf.core.dtypes.Decimal32Dtype
    elif cudf.api.types.is_decimal64_dtype(dtype):
        return cudf.core.dtypes.Decimal64Dtype
    elif cudf.api.types.is_decimal128_dtype(dtype):
        return cudf.core.dtypes.Decimal128Dtype
    elif dtype in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES:
        # Plain numpy dtype supported by pylibcudf: return its scalar type.
        return dtype.type

    # Anything else (e.g. builtin Python types): defer to pandas' inference.
    return infer_dtype_from_object(dtype)


def cudf_dtype_to_pa_type(dtype: DtypeObj) -> pa.DataType:
"""Given a cudf pandas dtype, converts it into the equivalent cuDF
Python dtype.
Expand All @@ -172,8 +116,12 @@ def cudf_dtype_to_pa_type(dtype: DtypeObj) -> pa.DataType:
(cudf.StructDtype, cudf.ListDtype, cudf.core.dtypes.DecimalDtype),
):
return dtype.to_arrow()
elif isinstance(dtype, pd.DatetimeTZDtype):
return pa.timestamp(dtype.unit, str(dtype.tz))
elif dtype == CUDF_STRING_DTYPE:
return pa.string()
else:
return np_to_pa_dtype(dtype)
return pa.from_numpy_dtype(dtype)


def cudf_dtype_from_pa_type(typ: pa.DataType) -> DtypeObj:
Expand All @@ -186,7 +134,7 @@ def cudf_dtype_from_pa_type(typ: pa.DataType) -> DtypeObj:
return cudf.core.dtypes.StructDtype.from_arrow(typ)
elif pa.types.is_decimal(typ):
return cudf.core.dtypes.Decimal128Dtype.from_arrow(typ)
elif pa.types.is_large_string(typ):
elif pa.types.is_large_string(typ) or pa.types.is_string(typ):
return CUDF_STRING_DTYPE
else:
return cudf.api.types.pandas_dtype(typ.to_pandas_dtype())
Expand Down
39 changes: 10 additions & 29 deletions python/cudf/cudf/utils/ioutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from cudf.api.types import is_list_like
from cudf.core._compat import PANDAS_LT_300
from cudf.utils.docutils import docfmt_partial
from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes, np_to_pa_dtype
from cudf.utils.dtypes import cudf_dtype_to_pa_type, np_dtypes_to_pandas_dtypes

try:
import fsspec.parquet as fsspec_parquet
Expand Down Expand Up @@ -1546,25 +1546,14 @@ def generate_pandas_metadata(table: cudf.DataFrame, index: bool | None) -> str:
else:
col_names.append(name)

if isinstance(col.dtype, cudf.CategoricalDtype):
raise ValueError(
"'category' column dtypes are currently not "
+ "supported by the gpu accelerated parquet writer"
)
elif isinstance(
col.dtype,
(cudf.ListDtype, cudf.StructDtype, cudf.core.dtypes.DecimalDtype),
):
types.append(col.dtype.to_arrow())
else:
if col.dtype.kind == "b":
# A boolean element takes 8 bits in cudf and 1 bit in
# pyarrow. To make sure the cudf format is interoperable
# with arrow, we use `int8` type when converting from a
# cudf boolean array.
if col.dtype.type == np.bool_:
types.append(pa.int8())
else:
types.append(np_to_pa_dtype(col.dtype))
types.append(pa.int8())
else:
types.append(cudf_dtype_to_pa_type(col.dtype))

# Indexes
materialize_index = False
Expand Down Expand Up @@ -1596,29 +1585,21 @@ def generate_pandas_metadata(table: cudf.DataFrame, index: bool | None) -> str:
index_levels.append(materialized_idx)
columns_to_convert.append(materialized_idx._values)
col_names.append(descr)
types.append(np_to_pa_dtype(materialized_idx.dtype))
types.append(pa.from_numpy_dtype(materialized_idx.dtype))
else:
descr = _index_level_name(
index_name=idx.name, level=level, column_names=col_names
)
columns_to_convert.append(idx._values)
col_names.append(descr)
if isinstance(idx.dtype, cudf.CategoricalDtype):
raise ValueError(
"'category' column dtypes are currently not "
+ "supported by the gpu accelerated parquet writer"
)
elif isinstance(idx.dtype, cudf.ListDtype):
types.append(col.dtype.to_arrow())
else:
if idx.dtype.kind == "b":
# A boolean element takes 8 bits in cudf and 1 bit in
# pyarrow. To make sure the cudf format is interoperable
# with arrow, we use `int8` type when converting from a
# cudf boolean array.
if idx.dtype.type == np.bool_:
types.append(pa.int8())
else:
types.append(np_to_pa_dtype(idx.dtype))
types.append(pa.int8())
else:
types.append(cudf_dtype_to_pa_type(idx.dtype))

index_levels.append(idx)
index_descriptors.append(descr)
Expand Down
Loading