Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add CSV Reader options classes to pylibcudf #17412

Merged
merged 11 commits into from
Nov 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 62 additions & 37 deletions python/cudf/cudf/_lib/csv.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -202,46 +202,71 @@ def read_csv(
raise ValueError(
"dtype should be a scalar/str/list-like/dict-like"
)
options = (
plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([datasource]))
.compression(c_compression)
.mangle_dupe_cols(mangle_dupe_cols)
.byte_range_offset(byte_range[0])
.byte_range_size(byte_range[1])
.nrows(nrows if nrows is not None else -1)
.skiprows(skiprows)
.skipfooter(skipfooter)
.quoting(quoting)
.lineterminator(str(lineterminator))
.quotechar(quotechar)
.decimal(decimal)
.delim_whitespace(delim_whitespace)
.skipinitialspace(skipinitialspace)
.skip_blank_lines(skip_blank_lines)
.doublequote(doublequote)
.keep_default_na(keep_default_na)
.na_filter(na_filter)
.dayfirst(dayfirst)
.build()
)

options.set_header(header)

if names is not None:
options.set_names([str(name) for name in names])

if prefix is not None:
options.set_prefix(prefix)

if usecols is not None:
if all(isinstance(col, int) for col in usecols):
options.set_use_cols_indexes(list(usecols))
else:
options.set_use_cols_names([str(name) for name in usecols])

if delimiter is not None:
options.set_delimiter(delimiter)

if thousands is not None:
options.set_thousands(thousands)

lineterminator = str(lineterminator)
if comment is not None:
options.set_comment(comment)

if parse_dates is not None:
options.set_parse_dates(list(parse_dates))

if hex_cols is not None:
options.set_parse_hex(list(hex_cols))

options.set_dtypes(new_dtypes)

if true_values is not None:
options.set_true_values([str(val) for val in true_values])

if false_values is not None:
options.set_false_values([str(val) for val in false_values])

if na_values is not None:
options.set_na_values([str(val) for val in na_values])

df = cudf.DataFrame._from_data(
*data_from_pylibcudf_io(
plc.io.csv.read_csv(
plc.io.SourceInfo([datasource]),
lineterminator=lineterminator,
quotechar = quotechar,
quoting = quoting,
doublequote = doublequote,
header = header,
mangle_dupe_cols = mangle_dupe_cols,
usecols = usecols,
delimiter = delimiter,
delim_whitespace = delim_whitespace,
skipinitialspace = skipinitialspace,
col_names = names,
dtypes = new_dtypes,
skipfooter = skipfooter,
skiprows = skiprows,
dayfirst = dayfirst,
compression = c_compression,
thousands = thousands,
decimal = decimal,
true_values = true_values,
false_values = false_values,
nrows = nrows if nrows is not None else -1,
byte_range_offset = byte_range[0],
byte_range_size = byte_range[1],
skip_blank_lines = skip_blank_lines,
parse_dates = parse_dates,
parse_hex = hex_cols,
comment = comment,
na_values = na_values,
keep_default_na = keep_default_na,
na_filter = na_filter,
prefix = prefix,
)
)
*data_from_pylibcudf_io(plc.io.csv.read_csv(options))
)

if dtype is not None:
Expand Down
37 changes: 21 additions & 16 deletions python/cudf_polars/cudf_polars/dsl/ir.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,23 +476,28 @@ def do_evaluate(
with path.open() as f:
while f.readline() == "\n":
skiprows += 1
tbl_w_meta = plc.io.csv.read_csv(
plc.io.SourceInfo([path]),
delimiter=sep,
quotechar=quote,
lineterminator=eol,
col_names=column_names,
header=header,
usecols=usecols,
na_filter=True,
na_values=null_values,
keep_default_na=False,
skiprows=skiprows,
comment=comment,
decimal=decimal,
dtypes=schema,
nrows=n_rows,
options = (
plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([path]))
.nrows(n_rows)
.skiprows(skiprows)
.lineterminator(str(eol))
.quotechar(str(quote))
.decimal(decimal)
.keep_default_na(keep_default_na=False)
.na_filter(na_filter=True)
.build()
)
options.set_delimiter(str(sep))
if column_names is not None:
options.set_names([str(name) for name in column_names])
options.set_header(header)
options.set_dtypes(schema)
if usecols is not None:
options.set_use_cols_names([str(name) for name in usecols])
options.set_na_values(null_values)
if comment is not None:
options.set_comment(comment)
tbl_w_meta = plc.io.csv.read_csv(options)
pieces.append(tbl_w_meta)
if read_partial:
n_rows -= tbl_w_meta.tbl.num_rows()
Expand Down
56 changes: 54 additions & 2 deletions python/pylibcudf/pylibcudf/io/csv.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,63 @@ from libcpp cimport bool
from pylibcudf.libcudf.io.csv cimport (
csv_writer_options,
csv_writer_options_builder,
csv_reader_options,
csv_reader_options_builder,
)
from pylibcudf.libcudf.io.types cimport quote_style
from pylibcudf.io.types cimport SinkInfo
from pylibcudf.io.types cimport SinkInfo, SourceInfo, TableWithMetadata
from pylibcudf.table cimport Table

from pylibcudf.libcudf.io.types cimport (
compression_type,
quote_style,
table_with_metadata,
)
from pylibcudf.libcudf.types cimport size_type

# Declaration of the extension type that wraps a libcudf `csv_reader_options`
# value (stored in `c_obj`). Every `set_*` method is `cpdef void`: callable from
# both Cython and Python, and returning nothing.
cdef class CsvReaderOptions:
cdef csv_reader_options c_obj
# NOTE(review): presumably kept so the source outlives the options object —
# confirm against the implementation in csv.pyx.
cdef SourceInfo source
cpdef void set_header(self, size_type header)
cpdef void set_names(self, list col_names)
cpdef void set_prefix(self, str prefix)
# Column selection has two separate setters: one takes a list of indices,
# the other a list of names.
cpdef void set_use_cols_indexes(self, list col_indices)
cpdef void set_use_cols_names(self, list col_names)
cpdef void set_delimiter(self, str delimiter)
cpdef void set_thousands(self, str thousands)
cpdef void set_comment(self, str comment)
cpdef void set_parse_dates(self, list val)
cpdef void set_parse_hex(self, list val)
# `types` is declared as a generic `object`; the accepted shapes are not
# visible in this declaration file.
cpdef void set_dtypes(self, object types)
cpdef void set_true_values(self, list true_values)
cpdef void set_false_values(self, list false_values)
cpdef void set_na_values(self, list na_values)


# Declaration of the extension type that wraps a libcudf
# `csv_reader_options_builder` (stored in `c_obj`). Each option method is
# declared to return a CsvReaderOptionsBuilder, which allows calls to be
# chained; `build()` is declared to return a CsvReaderOptions.
cdef class CsvReaderOptionsBuilder:
cdef csv_reader_options_builder c_obj
# NOTE(review): presumably kept so the source outlives the builder —
# confirm against the implementation in csv.pyx.
cdef SourceInfo source
cpdef CsvReaderOptionsBuilder compression(self, compression_type compression)
cpdef CsvReaderOptionsBuilder mangle_dupe_cols(self, bool mangle_dupe_cols)
# Byte-range arguments are unsigned (`size_t`), whereas the row-count
# arguments below use libcudf's `size_type`.
cpdef CsvReaderOptionsBuilder byte_range_offset(self, size_t byte_range_offset)
cpdef CsvReaderOptionsBuilder byte_range_size(self, size_t byte_range_size)
cpdef CsvReaderOptionsBuilder nrows(self, size_type nrows)
cpdef CsvReaderOptionsBuilder skiprows(self, size_type skiprows)
cpdef CsvReaderOptionsBuilder skipfooter(self, size_type skipfooter)
cpdef CsvReaderOptionsBuilder quoting(self, quote_style quoting)
cpdef CsvReaderOptionsBuilder lineterminator(self, str lineterminator)
cpdef CsvReaderOptionsBuilder quotechar(self, str quotechar)
cpdef CsvReaderOptionsBuilder decimal(self, str decimal)
cpdef CsvReaderOptionsBuilder delim_whitespace(self, bool delim_whitespace)
cpdef CsvReaderOptionsBuilder skipinitialspace(self, bool skipinitialspace)
cpdef CsvReaderOptionsBuilder skip_blank_lines(self, bool skip_blank_lines)
cpdef CsvReaderOptionsBuilder doublequote(self, bool doublequote)
cpdef CsvReaderOptionsBuilder keep_default_na(self, bool keep_default_na)
cpdef CsvReaderOptionsBuilder na_filter(self, bool na_filter)
cpdef CsvReaderOptionsBuilder dayfirst(self, bool dayfirst)
# Finishes the chain and hands back the configured options object.
cpdef CsvReaderOptions build(self)

# CSV read entry point: takes a single CsvReaderOptions argument and is
# declared to return a TableWithMetadata.
cpdef TableWithMetadata read_csv(CsvReaderOptions options)

cdef class CsvWriterOptions:
cdef csv_writer_options c_obj
cdef Table table
Expand Down
63 changes: 52 additions & 11 deletions python/pylibcudf/pylibcudf/io/csv.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from collections.abc import Mapping

from typing_extensions import Self

from pylibcudf.io.types import (
CompressionType,
QuoteStyle,
Expand All @@ -12,6 +14,47 @@ from pylibcudf.io.types import (
from pylibcudf.table import Table
from pylibcudf.types import DataType

# Type stub for the CSV reader options object that is passed to ``read_csv``.
# Instances are produced with ``CsvReaderOptions.builder(source)...build()``.
# The ``set_*`` methods are declared ``cpdef void`` in the matching .pxd, so
# each one is annotated ``-> None`` here; without an explicit return
# annotation, type checkers would treat the result as ``Any``.
class CsvReaderOptions:
    def __init__(self) -> None: ...
    def set_header(self, header: int) -> None: ...
    def set_names(self, col_names: list[str]) -> None: ...
    def set_prefix(self, prefix: str) -> None: ...
    # Column selection: by position or by name, via separate setters.
    def set_use_cols_indexes(self, col_indices: list[int]) -> None: ...
    def set_use_cols_names(self, col_names: list[str]) -> None: ...
    def set_delimiter(self, delimiter: str) -> None: ...
    def set_thousands(self, thousands: str) -> None: ...
    def set_comment(self, comment: str) -> None: ...
    # Columns may be identified by index or by name in the same list.
    def set_parse_dates(self, val: list[int | str]) -> None: ...
    def set_parse_hex(self, val: list[int | str]) -> None: ...
    # Either a per-column-name mapping or a positional list of dtypes.
    def set_dtypes(
        self, types: dict[str, DataType] | list[DataType]
    ) -> None: ...
    def set_true_values(self, true_values: list[str]) -> None: ...
    def set_false_values(self, false_values: list[str]) -> None: ...
    def set_na_values(self, na_values: list[str]) -> None: ...
    # Entry point for the fluent builder bound to ``source``.
    @staticmethod
    def builder(source: SourceInfo) -> CsvReaderOptionsBuilder: ...

# Type stub for the fluent builder returned by ``CsvReaderOptions.builder``.
# Every option method returns ``Self`` so calls can be chained; ``build()``
# ends the chain and returns the finished ``CsvReaderOptions``.
class CsvReaderOptionsBuilder:
    # Annotated ``-> None`` explicitly so type checkers do not infer ``Any``.
    def __init__(self) -> None: ...
    def compression(self, compression: CompressionType) -> Self: ...
    def mangle_dupe_cols(self, mangle_dupe_cols: bool) -> Self: ...
    def byte_range_offset(self, byte_range_offset: int) -> Self: ...
    def byte_range_size(self, byte_range_size: int) -> Self: ...
    def nrows(self, nrows: int) -> Self: ...
    def skiprows(self, skiprows: int) -> Self: ...
    def skipfooter(self, skipfooter: int) -> Self: ...
    def quoting(self, quoting: QuoteStyle) -> Self: ...
    def lineterminator(self, lineterminator: str) -> Self: ...
    def quotechar(self, quotechar: str) -> Self: ...
    def decimal(self, decimal: str) -> Self: ...
    def delim_whitespace(self, delim_whitespace: bool) -> Self: ...
    def skipinitialspace(self, skipinitialspace: bool) -> Self: ...
    def skip_blank_lines(self, skip_blank_lines: bool) -> Self: ...
    def doublequote(self, doublequote: bool) -> Self: ...
    def keep_default_na(self, keep_default_na: bool) -> Self: ...
    def na_filter(self, na_filter: bool) -> Self: ...
    def dayfirst(self, dayfirst: bool) -> Self: ...
    def build(self) -> CsvReaderOptions: ...

def read_csv(
source_info: SourceInfo,
*,
Expand Down Expand Up @@ -54,7 +97,7 @@ def read_csv(
# detect_whitespace_around_quotes: bool = False,
# timestamp_type: DataType = DataType(type_id.EMPTY),
) -> TableWithMetadata: ...
def write_csv(options: CsvWriterOptionsBuilder) -> None: ...
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was this annotation incorrect before?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, I don't think so. I wanted to stay consistent because I noticed we dropped the `None` in other PRs.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I see. IMO we should still include `None` as the return type. cc @wence-

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll follow up based on what we decide

# The function returns nothing; the explicit ``-> None`` keeps type checkers
# from treating the result as ``Any``.
def write_csv(options: CsvWriterOptionsBuilder) -> None: ...

class CsvWriterOptions:
def __init__(self): ...
Expand All @@ -63,14 +106,12 @@ class CsvWriterOptions:

class CsvWriterOptionsBuilder:
def __init__(self): ...
def names(self, names: list) -> CsvWriterOptionsBuilder: ...
def na_rep(self, val: str) -> CsvWriterOptionsBuilder: ...
def include_header(self, val: bool) -> CsvWriterOptionsBuilder: ...
def rows_per_chunk(self, val: int) -> CsvWriterOptionsBuilder: ...
def line_terminator(self, term: str) -> CsvWriterOptionsBuilder: ...
def inter_column_delimiter(
self, delim: str
) -> CsvWriterOptionsBuilder: ...
def true_value(self, val: str) -> CsvWriterOptionsBuilder: ...
def false_value(self, val: str) -> CsvWriterOptionsBuilder: ...
def names(self, names: list) -> Self: ...
def na_rep(self, val: str) -> Self: ...
def include_header(self, val: bool) -> Self: ...
def rows_per_chunk(self, val: int) -> Self: ...
def line_terminator(self, term: str) -> Self: ...
def inter_column_delimiter(self, delim: str) -> Self: ...
def true_value(self, val: str) -> Self: ...
def false_value(self, val: str) -> Self: ...
def build(self) -> CsvWriterOptions: ...
Loading
Loading