Skip to content

Commit

Permalink
Add CSV Reader options classes to pylibcudf (#17412)
Browse files Browse the repository at this point in the history
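This PR adds the CSV reader options classes (`CsvReaderOptions` and `CsvReaderOptionsBuilder`) to pylibcudf and plumbs them through the CSV readers in cuDF Python (`cudf/_lib/csv.pyx`) and cudf-polars (`cudf_polars/dsl/ir.py`).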

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: #17412
  • Loading branch information
Matt711 authored Nov 27, 2024
1 parent 797a07b commit 4533085
Show file tree
Hide file tree
Showing 6 changed files with 853 additions and 332 deletions.
99 changes: 62 additions & 37 deletions python/cudf/cudf/_lib/csv.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -202,46 +202,71 @@ def read_csv(
raise ValueError(
"dtype should be a scalar/str/list-like/dict-like"
)
options = (
plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([datasource]))
.compression(c_compression)
.mangle_dupe_cols(mangle_dupe_cols)
.byte_range_offset(byte_range[0])
.byte_range_size(byte_range[1])
.nrows(nrows if nrows is not None else -1)
.skiprows(skiprows)
.skipfooter(skipfooter)
.quoting(quoting)
.lineterminator(str(lineterminator))
.quotechar(quotechar)
.decimal(decimal)
.delim_whitespace(delim_whitespace)
.skipinitialspace(skipinitialspace)
.skip_blank_lines(skip_blank_lines)
.doublequote(doublequote)
.keep_default_na(keep_default_na)
.na_filter(na_filter)
.dayfirst(dayfirst)
.build()
)

options.set_header(header)

if names is not None:
options.set_names([str(name) for name in names])

if prefix is not None:
options.set_prefix(prefix)

if usecols is not None:
if all(isinstance(col, int) for col in usecols):
options.set_use_cols_indexes(list(usecols))
else:
options.set_use_cols_names([str(name) for name in usecols])

if delimiter is not None:
options.set_delimiter(delimiter)

if thousands is not None:
options.set_thousands(thousands)

lineterminator = str(lineterminator)
if comment is not None:
options.set_comment(comment)

if parse_dates is not None:
options.set_parse_dates(list(parse_dates))

if hex_cols is not None:
options.set_parse_hex(list(hex_cols))

options.set_dtypes(new_dtypes)

if true_values is not None:
options.set_true_values([str(val) for val in true_values])

if false_values is not None:
options.set_false_values([str(val) for val in false_values])

if na_values is not None:
options.set_na_values([str(val) for val in na_values])

df = cudf.DataFrame._from_data(
*data_from_pylibcudf_io(
plc.io.csv.read_csv(
plc.io.SourceInfo([datasource]),
lineterminator=lineterminator,
quotechar = quotechar,
quoting = quoting,
doublequote = doublequote,
header = header,
mangle_dupe_cols = mangle_dupe_cols,
usecols = usecols,
delimiter = delimiter,
delim_whitespace = delim_whitespace,
skipinitialspace = skipinitialspace,
col_names = names,
dtypes = new_dtypes,
skipfooter = skipfooter,
skiprows = skiprows,
dayfirst = dayfirst,
compression = c_compression,
thousands = thousands,
decimal = decimal,
true_values = true_values,
false_values = false_values,
nrows = nrows if nrows is not None else -1,
byte_range_offset = byte_range[0],
byte_range_size = byte_range[1],
skip_blank_lines = skip_blank_lines,
parse_dates = parse_dates,
parse_hex = hex_cols,
comment = comment,
na_values = na_values,
keep_default_na = keep_default_na,
na_filter = na_filter,
prefix = prefix,
)
)
*data_from_pylibcudf_io(plc.io.csv.read_csv(options))
)

if dtype is not None:
Expand Down
37 changes: 21 additions & 16 deletions python/cudf_polars/cudf_polars/dsl/ir.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,23 +476,28 @@ def do_evaluate(
with path.open() as f:
while f.readline() == "\n":
skiprows += 1
tbl_w_meta = plc.io.csv.read_csv(
plc.io.SourceInfo([path]),
delimiter=sep,
quotechar=quote,
lineterminator=eol,
col_names=column_names,
header=header,
usecols=usecols,
na_filter=True,
na_values=null_values,
keep_default_na=False,
skiprows=skiprows,
comment=comment,
decimal=decimal,
dtypes=schema,
nrows=n_rows,
options = (
plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([path]))
.nrows(n_rows)
.skiprows(skiprows)
.lineterminator(str(eol))
.quotechar(str(quote))
.decimal(decimal)
.keep_default_na(keep_default_na=False)
.na_filter(na_filter=True)
.build()
)
options.set_delimiter(str(sep))
if column_names is not None:
options.set_names([str(name) for name in column_names])
options.set_header(header)
options.set_dtypes(schema)
if usecols is not None:
options.set_use_cols_names([str(name) for name in usecols])
options.set_na_values(null_values)
if comment is not None:
options.set_comment(comment)
tbl_w_meta = plc.io.csv.read_csv(options)
pieces.append(tbl_w_meta)
if read_partial:
n_rows -= tbl_w_meta.tbl.num_rows()
Expand Down
56 changes: 54 additions & 2 deletions python/pylibcudf/pylibcudf/io/csv.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,63 @@ from libcpp cimport bool
from pylibcudf.libcudf.io.csv cimport (
csv_writer_options,
csv_writer_options_builder,
csv_reader_options,
csv_reader_options_builder,
)
from pylibcudf.libcudf.io.types cimport quote_style
from pylibcudf.io.types cimport SinkInfo
from pylibcudf.io.types cimport SinkInfo, SourceInfo, TableWithMetadata
from pylibcudf.table cimport Table

from pylibcudf.libcudf.io.types cimport (
compression_type,
quote_style,
table_with_metadata,
)
from pylibcudf.libcudf.types cimport size_type

# Cython declaration of CsvReaderOptions, the extension type that wraps a
# libcudf ``csv_reader_options`` value. Only the attribute layout and the
# cpdef method signatures are declared here; the implementations are not
# part of this declaration file.
cdef class CsvReaderOptions:
# The wrapped libcudf csv_reader_options value.
cdef csv_reader_options c_obj
# SourceInfo stored alongside c_obj.
# NOTE(review): presumably held so the input source outlives the options
# object -- confirm against the implementation.
cdef SourceInfo source
# Setters. Every one is declared ``cpdef void``: callable from both Cython
# and Python, returning nothing.
# ``header`` is a libcudf size_type.
cpdef void set_header(self, size_type header)
cpdef void set_names(self, list col_names)
cpdef void set_prefix(self, str prefix)
# Column selection has two separate setters: one taking indices, one
# taking names.
cpdef void set_use_cols_indexes(self, list col_indices)
cpdef void set_use_cols_names(self, list col_names)
# delimiter / thousands / comment are declared as ``str``.
cpdef void set_delimiter(self, str delimiter)
cpdef void set_thousands(self, str thousands)
cpdef void set_comment(self, str comment)
cpdef void set_parse_dates(self, list val)
cpdef void set_parse_hex(self, list val)
# ``types`` is an untyped Python object, so the accepted shapes are decided
# by the implementation rather than by this signature.
cpdef void set_dtypes(self, object types)
cpdef void set_true_values(self, list true_values)
cpdef void set_false_values(self, list false_values)
cpdef void set_na_values(self, list na_values)


# Cython declaration of CsvReaderOptionsBuilder, the extension type that wraps
# a libcudf ``csv_reader_options_builder``. Every option method is declared to
# return a CsvReaderOptionsBuilder so calls can be chained; ``build()`` returns
# the CsvReaderOptions.
cdef class CsvReaderOptionsBuilder:
# The wrapped libcudf csv_reader_options_builder value.
cdef csv_reader_options_builder c_obj
# SourceInfo stored alongside c_obj.
# NOTE(review): presumably kept alive here and handed on to the built
# CsvReaderOptions -- confirm against the implementation.
cdef SourceInfo source
# ``compression`` takes the libcudf compression_type enum.
cpdef CsvReaderOptionsBuilder compression(self, compression_type compression)
cpdef CsvReaderOptionsBuilder mangle_dupe_cols(self, bool mangle_dupe_cols)
# Byte-range arguments are size_t.
cpdef CsvReaderOptionsBuilder byte_range_offset(self, size_t byte_range_offset)
cpdef CsvReaderOptionsBuilder byte_range_size(self, size_t byte_range_size)
# Row-count arguments are libcudf size_type.
cpdef CsvReaderOptionsBuilder nrows(self, size_type nrows)
cpdef CsvReaderOptionsBuilder skiprows(self, size_type skiprows)
cpdef CsvReaderOptionsBuilder skipfooter(self, size_type skipfooter)
# ``quoting`` takes the libcudf quote_style enum.
cpdef CsvReaderOptionsBuilder quoting(self, quote_style quoting)
# lineterminator / quotechar / decimal are declared as ``str``.
cpdef CsvReaderOptionsBuilder lineterminator(self, str lineterminator)
cpdef CsvReaderOptionsBuilder quotechar(self, str quotechar)
cpdef CsvReaderOptionsBuilder decimal(self, str decimal)
# The remaining options are C++ bool flags.
cpdef CsvReaderOptionsBuilder delim_whitespace(self, bool delim_whitespace)
cpdef CsvReaderOptionsBuilder skipinitialspace(self, bool skipinitialspace)
cpdef CsvReaderOptionsBuilder skip_blank_lines(self, bool skip_blank_lines)
cpdef CsvReaderOptionsBuilder doublequote(self, bool doublequote)
cpdef CsvReaderOptionsBuilder keep_default_na(self, bool keep_default_na)
cpdef CsvReaderOptionsBuilder na_filter(self, bool na_filter)
cpdef CsvReaderOptionsBuilder dayfirst(self, bool dayfirst)
# Ends the chain and returns the CsvReaderOptions.
cpdef CsvReaderOptions build(self)

# Module-level CSV reader entry point: takes one CsvReaderOptions object and
# returns a TableWithMetadata.
cpdef TableWithMetadata read_csv(CsvReaderOptions options)

cdef class CsvWriterOptions:
cdef csv_writer_options c_obj
cdef Table table
Expand Down
63 changes: 52 additions & 11 deletions python/pylibcudf/pylibcudf/io/csv.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from collections.abc import Mapping

from typing_extensions import Self

from pylibcudf.io.types import (
CompressionType,
QuoteStyle,
Expand All @@ -12,6 +14,47 @@ from pylibcudf.io.types import (
from pylibcudf.table import Table
from pylibcudf.types import DataType

# Type stub for pylibcudf's CsvReaderOptions: the options object accepted by
# the options-based read_csv. Instances are obtained through the static
# ``builder(source)`` factory followed by ``.build()`` on the returned builder;
# the setters below adjust individual options afterwards.
class CsvReaderOptions:
def __init__(self): ...
# Setters carry no return annotation in this stub.
def set_header(self, header: int): ...
def set_names(self, col_names: list[str]): ...
def set_prefix(self, prefix: str): ...
# Column selection by index and by name use separate setters.
def set_use_cols_indexes(self, col_indices: list[int]): ...
def set_use_cols_names(self, col_names: list[str]): ...
def set_delimiter(self, delimiter: str): ...
def set_thousands(self, thousands: str): ...
def set_comment(self, comment: str): ...
# Entries may be int or str per the annotation.
# NOTE(review): presumably column indices or column names -- confirm.
def set_parse_dates(self, val: list[int | str]): ...
def set_parse_hex(self, val: list[int | str]): ...
# Accepts either a str -> DataType mapping or a list of DataType.
def set_dtypes(self, types: dict[str, DataType] | list[DataType]): ...
def set_true_values(self, true_values: list[str]): ...
def set_false_values(self, false_values: list[str]): ...
def set_na_values(self, na_values: list[str]): ...
# Factory: returns a CsvReaderOptionsBuilder for the given SourceInfo.
@staticmethod
def builder(source: SourceInfo) -> CsvReaderOptionsBuilder: ...

# Type stub for pylibcudf's CsvReaderOptionsBuilder. Every option method is
# annotated as returning ``Self`` so calls can be chained; ``build()`` ends the
# chain and returns a CsvReaderOptions.
class CsvReaderOptionsBuilder:
def __init__(self): ...
def compression(self, compression: CompressionType) -> Self: ...
def mangle_dupe_cols(self, mangle_dupe_cols: bool) -> Self: ...
# Byte range and row-count options are plain ints at the Python level.
def byte_range_offset(self, byte_range_offset: int) -> Self: ...
def byte_range_size(self, byte_range_size: int) -> Self: ...
def nrows(self, nrows: int) -> Self: ...
def skiprows(self, skiprows: int) -> Self: ...
def skipfooter(self, skipfooter: int) -> Self: ...
def quoting(self, quoting: QuoteStyle) -> Self: ...
# lineterminator / quotechar / decimal are annotated as ``str``.
def lineterminator(self, lineterminator: str) -> Self: ...
def quotechar(self, quotechar: str) -> Self: ...
def decimal(self, decimal: str) -> Self: ...
# Boolean flags.
def delim_whitespace(self, delim_whitespace: bool) -> Self: ...
def skipinitialspace(self, skipinitialspace: bool) -> Self: ...
def skip_blank_lines(self, skip_blank_lines: bool) -> Self: ...
def doublequote(self, doublequote: bool) -> Self: ...
def keep_default_na(self, keep_default_na: bool) -> Self: ...
def na_filter(self, na_filter: bool) -> Self: ...
def dayfirst(self, dayfirst: bool) -> Self: ...
# Returns the CsvReaderOptions.
def build(self) -> CsvReaderOptions: ...

def read_csv(
source_info: SourceInfo,
*,
Expand Down Expand Up @@ -54,7 +97,7 @@ def read_csv(
# detect_whitespace_around_quotes: bool = False,
# timestamp_type: DataType = DataType(type_id.EMPTY),
) -> TableWithMetadata: ...
def write_csv(options: CsvWriterOptionsBuilder) -> None: ...
def write_csv(options: CsvWriterOptionsBuilder): ...

class CsvWriterOptions:
def __init__(self): ...
Expand All @@ -63,14 +106,12 @@ class CsvWriterOptions:

# Type stub for pylibcudf's CsvWriterOptionsBuilder. Each option method returns
# the builder (annotated either as CsvWriterOptionsBuilder or as ``Self``) and
# ``build()`` returns a CsvWriterOptions.
# NOTE(review): every option method appears twice in this span, once with the
# explicit class name as return type and once with ``Self``. This looks like
# the removed and added lines of a diff shown together -- confirm which set is
# current before relying on it.
class CsvWriterOptionsBuilder:
def __init__(self): ...
def names(self, names: list) -> CsvWriterOptionsBuilder: ...
def na_rep(self, val: str) -> CsvWriterOptionsBuilder: ...
def include_header(self, val: bool) -> CsvWriterOptionsBuilder: ...
def rows_per_chunk(self, val: int) -> CsvWriterOptionsBuilder: ...
def line_terminator(self, term: str) -> CsvWriterOptionsBuilder: ...
def inter_column_delimiter(
self, delim: str
) -> CsvWriterOptionsBuilder: ...
def true_value(self, val: str) -> CsvWriterOptionsBuilder: ...
def false_value(self, val: str) -> CsvWriterOptionsBuilder: ...
def names(self, names: list) -> Self: ...
def na_rep(self, val: str) -> Self: ...
def include_header(self, val: bool) -> Self: ...
def rows_per_chunk(self, val: int) -> Self: ...
def line_terminator(self, term: str) -> Self: ...
def inter_column_delimiter(self, delim: str) -> Self: ...
def true_value(self, val: str) -> Self: ...
def false_value(self, val: str) -> Self: ...
def build(self) -> CsvWriterOptions: ...
Loading

0 comments on commit 4533085

Please sign in to comment.