diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 59a970263e0..641fc18c203 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -202,46 +202,71 @@ def read_csv( raise ValueError( "dtype should be a scalar/str/list-like/dict-like" ) + options = ( + plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([datasource])) + .compression(c_compression) + .mangle_dupe_cols(mangle_dupe_cols) + .byte_range_offset(byte_range[0]) + .byte_range_size(byte_range[1]) + .nrows(nrows if nrows is not None else -1) + .skiprows(skiprows) + .skipfooter(skipfooter) + .quoting(quoting) + .lineterminator(str(lineterminator)) + .quotechar(quotechar) + .decimal(decimal) + .delim_whitespace(delim_whitespace) + .skipinitialspace(skipinitialspace) + .skip_blank_lines(skip_blank_lines) + .doublequote(doublequote) + .keep_default_na(keep_default_na) + .na_filter(na_filter) + .dayfirst(dayfirst) + .build() + ) + + options.set_header(header) + + if names is not None: + options.set_names([str(name) for name in names]) + + if prefix is not None: + options.set_prefix(prefix) + + if usecols is not None: + if all(isinstance(col, int) for col in usecols): + options.set_use_cols_indexes(list(usecols)) + else: + options.set_use_cols_names([str(name) for name in usecols]) + + if delimiter is not None: + options.set_delimiter(delimiter) + + if thousands is not None: + options.set_thousands(thousands) - lineterminator = str(lineterminator) + if comment is not None: + options.set_comment(comment) + + if parse_dates is not None: + options.set_parse_dates(list(parse_dates)) + + if hex_cols is not None: + options.set_parse_hex(list(hex_cols)) + + options.set_dtypes(new_dtypes) + + if true_values is not None: + options.set_true_values([str(val) for val in true_values]) + + if false_values is not None: + options.set_false_values([str(val) for val in false_values]) + + if na_values is not None: + options.set_na_values([str(val) for val in na_values]) df = cudf.DataFrame._from_data( - *data_from_pylibcudf_io( - plc.io.csv.read_csv( - plc.io.SourceInfo([datasource]), - lineterminator=lineterminator, - quotechar = quotechar, - quoting = quoting, - doublequote = doublequote, - header = header, - mangle_dupe_cols = mangle_dupe_cols, - usecols = usecols, - delimiter = delimiter, - delim_whitespace = delim_whitespace, - skipinitialspace = skipinitialspace, - col_names = names, - dtypes = new_dtypes, - skipfooter = skipfooter, - skiprows = skiprows, - dayfirst = dayfirst, - compression = c_compression, - thousands = thousands, - decimal = decimal, - true_values = true_values, - false_values = false_values, - nrows = nrows if nrows is not None else -1, - byte_range_offset = byte_range[0], - byte_range_size = byte_range[1], - skip_blank_lines = skip_blank_lines, - parse_dates = parse_dates, - parse_hex = hex_cols, - comment = comment, - na_values = na_values, - keep_default_na = keep_default_na, - na_filter = na_filter, - prefix = prefix, - ) - ) + *data_from_pylibcudf_io(plc.io.csv.read_csv(options)) ) if dtype is not None: diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 62a2da9dcea..a5441e9d59f 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -476,23 +476,28 @@ def do_evaluate( with path.open() as f: while f.readline() == "\n": skiprows += 1 - tbl_w_meta = plc.io.csv.read_csv( - plc.io.SourceInfo([path]), - delimiter=sep, - quotechar=quote, - lineterminator=eol, - 
col_names=column_names, - header=header, - usecols=usecols, - na_filter=True, - na_values=null_values, - keep_default_na=False, - skiprows=skiprows, - comment=comment, - decimal=decimal, - dtypes=schema, - nrows=n_rows, + options = ( + plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([path])) + .nrows(n_rows) + .skiprows(skiprows) + .lineterminator(str(eol)) + .quotechar(str(quote)) + .decimal(decimal) + .keep_default_na(keep_default_na=False) + .na_filter(na_filter=True) + .build() ) + options.set_delimiter(str(sep)) + if column_names is not None: + options.set_names([str(name) for name in column_names]) + options.set_header(header) + options.set_dtypes(schema) + if usecols is not None: + options.set_use_cols_names([str(name) for name in usecols]) + options.set_na_values(null_values) + if comment is not None: + options.set_comment(comment) + tbl_w_meta = plc.io.csv.read_csv(options) pieces.append(tbl_w_meta) if read_partial: n_rows -= tbl_w_meta.tbl.num_rows() diff --git a/python/pylibcudf/pylibcudf/io/csv.pxd b/python/pylibcudf/pylibcudf/io/csv.pxd index f04edaa316a..95f3ff4fe45 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pxd +++ b/python/pylibcudf/pylibcudf/io/csv.pxd @@ -6,11 +6,63 @@ from libcpp cimport bool from pylibcudf.libcudf.io.csv cimport ( csv_writer_options, csv_writer_options_builder, + csv_reader_options, + csv_reader_options_builder, ) -from pylibcudf.libcudf.io.types cimport quote_style -from pylibcudf.io.types cimport SinkInfo +from pylibcudf.io.types cimport SinkInfo, SourceInfo, TableWithMetadata from pylibcudf.table cimport Table +from pylibcudf.libcudf.io.types cimport ( + compression_type, + quote_style, + table_with_metadata, +) +from pylibcudf.libcudf.types cimport size_type + +cdef class CsvReaderOptions: + cdef csv_reader_options c_obj + cdef SourceInfo source + cpdef void set_header(self, size_type header) + cpdef void set_names(self, list col_names) + cpdef void set_prefix(self, str prefix) + cpdef void set_use_cols_indexes(self, list col_indices) + cpdef void set_use_cols_names(self, list col_names) + cpdef void set_delimiter(self, str delimiter) + cpdef void set_thousands(self, str thousands) + cpdef void set_comment(self, str comment) + cpdef void set_parse_dates(self, list val) + cpdef void set_parse_hex(self, list val) + cpdef void set_dtypes(self, object types) + cpdef void set_true_values(self, list true_values) + cpdef void set_false_values(self, list false_values) + cpdef void set_na_values(self, list na_values) + + +cdef class CsvReaderOptionsBuilder: + cdef csv_reader_options_builder c_obj + cdef SourceInfo source + cpdef CsvReaderOptionsBuilder compression(self, compression_type compression) + cpdef CsvReaderOptionsBuilder mangle_dupe_cols(self, bool mangle_dupe_cols) + cpdef CsvReaderOptionsBuilder byte_range_offset(self, size_t byte_range_offset) + cpdef CsvReaderOptionsBuilder byte_range_size(self, size_t byte_range_size) + cpdef CsvReaderOptionsBuilder nrows(self, size_type nrows) + cpdef CsvReaderOptionsBuilder skiprows(self, size_type skiprows) + cpdef CsvReaderOptionsBuilder skipfooter(self, size_type skipfooter) + cpdef CsvReaderOptionsBuilder quoting(self, quote_style quoting) + cpdef CsvReaderOptionsBuilder lineterminator(self, str lineterminator) + cpdef CsvReaderOptionsBuilder quotechar(self, str quotechar) + cpdef CsvReaderOptionsBuilder decimal(self, str decimal) + cpdef CsvReaderOptionsBuilder delim_whitespace(self, bool delim_whitespace) + cpdef CsvReaderOptionsBuilder skipinitialspace(self, bool skipinitialspace) + cpdef 
CsvReaderOptionsBuilder skip_blank_lines(self, bool skip_blank_lines) + cpdef CsvReaderOptionsBuilder doublequote(self, bool doublequote) + cpdef CsvReaderOptionsBuilder keep_default_na(self, bool keep_default_na) + cpdef CsvReaderOptionsBuilder na_filter(self, bool na_filter) + cpdef CsvReaderOptionsBuilder dayfirst(self, bool dayfirst) + cpdef CsvReaderOptions build(self) + +cpdef TableWithMetadata read_csv(CsvReaderOptions options) + cdef class CsvWriterOptions: cdef csv_writer_options c_obj cdef Table table diff --git a/python/pylibcudf/pylibcudf/io/csv.pyi b/python/pylibcudf/pylibcudf/io/csv.pyi index 583b66bc29c..540cbc778ea 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pyi +++ b/python/pylibcudf/pylibcudf/io/csv.pyi @@ -2,6 +2,8 @@ from collections.abc import Mapping +from typing_extensions import Self + from pylibcudf.io.types import ( CompressionType, QuoteStyle, @@ -12,6 +14,47 @@ from pylibcudf.io.types import ( from pylibcudf.table import Table from pylibcudf.types import DataType +class CsvReaderOptions: + def __init__(self): ... + def set_header(self, header: int): ... + def set_names(self, col_names: list[str]): ... + def set_prefix(self, prefix: str): ... + def set_use_cols_indexes(self, col_indices: list[int]): ... + def set_use_cols_names(self, col_names: list[str]): ... + def set_delimiter(self, delimiter: str): ... + def set_thousands(self, thousands: str): ... + def set_comment(self, comment: str): ... + def set_parse_dates(self, val: list[int | str]): ... + def set_parse_hex(self, val: list[int | str]): ... + def set_dtypes(self, types: dict[str, DataType] | list[DataType]): ... + def set_true_values(self, true_values: list[str]): ... + def set_false_values(self, false_values: list[str]): ... + def set_na_values(self, na_values: list[str]): ... + @staticmethod + def builder(source: SourceInfo) -> CsvReaderOptionsBuilder: ... + +class CsvReaderOptionsBuilder: + def __init__(self): ... + def compression(self, compression: CompressionType) -> Self: ... + def mangle_dupe_cols(self, mangle_dupe_cols: bool) -> Self: ... + def byte_range_offset(self, byte_range_offset: int) -> Self: ... + def byte_range_size(self, byte_range_size: int) -> Self: ... + def nrows(self, nrows: int) -> Self: ... + def skiprows(self, skiprows: int) -> Self: ... + def skipfooter(self, skipfooter: int) -> Self: ... + def quoting(self, quoting: QuoteStyle) -> Self: ... + def lineterminator(self, lineterminator: str) -> Self: ... + def quotechar(self, quotechar: str) -> Self: ... + def decimal(self, decimal: str) -> Self: ... + def delim_whitespace(self, delim_whitespace: bool) -> Self: ... + def skipinitialspace(self, skipinitialspace: bool) -> Self: ... + def skip_blank_lines(self, skip_blank_lines: bool) -> Self: ... + def doublequote(self, doublequote: bool) -> Self: ... + def keep_default_na(self, keep_default_na: bool) -> Self: ... + def na_filter(self, na_filter: bool) -> Self: ... + def dayfirst(self, dayfirst: bool) -> Self: ... + def build(self) -> CsvReaderOptions: ... + def read_csv( source_info: SourceInfo, *, @@ -54,7 +97,7 @@ def read_csv( # detect_whitespace_around_quotes: bool = False, # timestamp_type: DataType = DataType(type_id.EMPTY), ) -> TableWithMetadata: ... -def write_csv(options: CsvWriterOptionsBuilder) -> None: ... +def write_csv(options: CsvWriterOptionsBuilder): ... class CsvWriterOptions: def __init__(self): ... @@ -63,14 +106,12 @@ class CsvWriterOptions: class CsvWriterOptionsBuilder: def __init__(self): ... 
-    def names(self, names: list) -> CsvWriterOptionsBuilder: ...
-    def na_rep(self, val: str) -> CsvWriterOptionsBuilder: ...
-    def include_header(self, val: bool) -> CsvWriterOptionsBuilder: ...
-    def rows_per_chunk(self, val: int) -> CsvWriterOptionsBuilder: ...
-    def line_terminator(self, term: str) -> CsvWriterOptionsBuilder: ...
-    def inter_column_delimiter(
-        self, delim: str
-    ) -> CsvWriterOptionsBuilder: ...
-    def true_value(self, val: str) -> CsvWriterOptionsBuilder: ...
-    def false_value(self, val: str) -> CsvWriterOptionsBuilder: ...
+    def names(self, names: list) -> Self: ...
+    def na_rep(self, val: str) -> Self: ...
+    def include_header(self, val: bool) -> Self: ...
+    def rows_per_chunk(self, val: int) -> Self: ...
+    def line_terminator(self, term: str) -> Self: ...
+    def inter_column_delimiter(self, delim: str) -> Self: ...
+    def true_value(self, val: str) -> Self: ...
+    def false_value(self, val: str) -> Self: ...
     def build(self) -> CsvWriterOptions: ...
diff --git a/python/pylibcudf/pylibcudf/io/csv.pyx b/python/pylibcudf/pylibcudf/io/csv.pyx
index 8be391de2c2..efc9bb813a1 100644
--- a/python/pylibcudf/pylibcudf/io/csv.pyx
+++ b/python/pylibcudf/pylibcudf/io/csv.pyx
@@ -28,252 +28,628 @@ __all__ = [
     "write_csv",
     "CsvWriterOptions",
     "CsvWriterOptionsBuilder",
+    "CsvReaderOptions",
+    "CsvReaderOptionsBuilder",
 ]
-cdef tuple _process_parse_dates_hex(list cols):
-    cdef vector[string] str_cols
-    cdef vector[int] int_cols
-    for col in cols:
-        if isinstance(col, str):
-            str_cols.push_back(col.encode())
+cdef class CsvReaderOptions:
+    """The settings to use for ``read_csv``
+    For details, see :cpp:class:`cudf::io::csv_reader_options`
+    """
+    @staticmethod
+    def builder(SourceInfo source):
+        """
+        Create a CsvReaderOptionsBuilder object
+
+        For details, see :cpp:func:`cudf::io::csv_reader_options::builder`
+
+        Parameters
+        ----------
+        source : SourceInfo
+            The source to read the CSV file from.
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+            Builder to build CsvReaderOptions
+        """
+        cdef CsvReaderOptionsBuilder csv_builder = CsvReaderOptionsBuilder.__new__(
+            CsvReaderOptionsBuilder
+        )
+        csv_builder.c_obj = csv_reader_options.builder(source.c_obj)
+        csv_builder.source = source
+        return csv_builder
+
+    cpdef void set_header(self, size_type header):
+        """
+        Sets header row index.
+
+        Parameters
+        ----------
+        header : size_type
+            Index where header row is located
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_header(header)
+
+    cpdef void set_names(self, list col_names):
+        """
+        Sets names of the columns.
+
+        Parameters
+        ----------
+        col_names : list[str]
+            List of column names
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[string] vec
+        for name in col_names:
+            vec.push_back(name.encode())
+        self.c_obj.set_names(vec)
+
+    cpdef void set_prefix(self, str prefix):
+        """
+        Sets prefix to be used for column ID.
+
+        Parameters
+        ----------
+        prefix : str
+            String used as prefix for each column name
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_prefix(prefix.encode())
+
+    cpdef void set_use_cols_indexes(self, list col_indices):
+        """
+        Sets indexes of columns to read.
+
+        Parameters
+        ----------
+        col_indices : list[int]
+            List of column indices that are needed
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[int] vec
+        for i in col_indices:
+            vec.push_back(i)
+        self.c_obj.set_use_cols_indexes(vec)
+
+    cpdef void set_use_cols_names(self, list col_names):
+        """
+        Sets names of the columns to be read.
+ + Parameters + ---------- + col_names : list[str] + List of column indices that are needed + + Returns + ------- + None + """ + cdef vector[string] vec + for name in col_names: + vec.push_back(name.encode()) + self.c_obj.set_use_cols_names(vec) + + cpdef void set_delimiter(self, str delimiter): + """ + Sets field delimiter. + + Parameters + ---------- + delimiter : str + A character to indicate delimiter + + Returns + ------- + None + """ + self.c_obj.set_delimiter(ord(delimiter)) + + cpdef void set_thousands(self, str thousands): + """ + Sets numeric data thousands separator. + + Parameters + ---------- + thousands : str + A character that separates thousands + + Returns + ------- + None + """ + self.c_obj.set_thousands(ord(thousands)) + + cpdef void set_comment(self, str comment): + """ + Sets comment line start character. + + Parameters + ---------- + comment : str + A character that indicates comment + + Returns + ------- + None + """ + self.c_obj.set_comment(ord(comment)) + + cpdef void set_parse_dates(self, list val): + """ + Sets indexes or names of columns to read as datetime. + + Parameters + ---------- + val : list[int | str] + List column indices or names to infer as datetime. + + Returns + ------- + None + """ + cdef vector[string] vec_str + cdef vector[int] vec_int + if not all([isinstance(col, (str, int)) for col in val]): + raise TypeError("Must be a list of int or str") + else: + for date in val: + if isinstance(date, str): + vec_str.push_back(date.encode()) + else: + vec_int.push_back(date) + self.c_obj.set_parse_dates(vec_str) + self.c_obj.set_parse_dates(vec_int) + + cpdef void set_parse_hex(self, list val): + """ + Sets indexes or names of columns to parse as hexadecimal. + + Parameters + ---------- + val : list[int | str] + List of column indices or names to parse as hexadecimal + + Returns + ------- + None + """ + cdef vector[string] vec_str + cdef vector[int] vec_int + if not all([isinstance(col, (str, int)) for col in val]): + raise TypeError("Must be a list of int or str") else: - int_cols.push_back(col) - return str_cols, int_cols - -cdef vector[string] _make_str_vector(list vals): - cdef vector[string] res - for val in vals: - res.push_back((val).encode()) - return res - - -def read_csv( - SourceInfo source_info, - *, - compression_type compression = compression_type.AUTO, - size_t byte_range_offset = 0, - size_t byte_range_size = 0, - list col_names = None, - str prefix = "", - bool mangle_dupe_cols = True, - list usecols = None, - size_type nrows = -1, - size_type skiprows = 0, - size_type skipfooter = 0, - size_type header = 0, - str lineterminator = "\n", - str delimiter = None, - str thousands = None, - str decimal = ".", - str comment = None, - bool delim_whitespace = False, - bool skipinitialspace = False, - bool skip_blank_lines = True, - quote_style quoting = quote_style.MINIMAL, - str quotechar = '"', - bool doublequote = True, - list parse_dates = None, - list parse_hex = None, - # Technically this should be dict/list - # but using a fused type prevents using None as default - object dtypes = None, - list true_values = None, - list false_values = None, - list na_values = None, - bool keep_default_na = True, - bool na_filter = True, - bool dayfirst = False, - # Note: These options are supported by the libcudf reader - # but are not exposed here since there is no demand for them - # on the Python side yet. 
- # bool detect_whitespace_around_quotes = False, - # DataType timestamp_type = DataType(type_id.EMPTY), + for hx in val: + if isinstance(hx, str): + vec_str.push_back(hx.encode()) + else: + vec_int.push_back(hx) + + self.c_obj.set_parse_hex(vec_str) + self.c_obj.set_parse_hex(vec_int) + + cpdef void set_dtypes(self, object types): + """ + Sets per-column types. + + Parameters + ---------- + types : dict[str, data_type] | list[data_type] + Column name to data type map specifying the columns' target data types. + Or a list specifying the columns' target data types. + + Returns + ------- + None + """ + cdef map[string, data_type] dtype_map + cdef vector[data_type] dtype_list + if isinstance(types, dict): + for name, dtype in types.items(): + dtype_map[str(name).encode()] = (dtype).c_obj + self.c_obj.set_dtypes(dtype_map) + elif isinstance(types, list): + for dtype in types: + dtype_list.push_back((dtype).c_obj) + self.c_obj.set_dtypes(dtype_list) + else: + raise TypeError("Must pass an dict or list") + + cpdef void set_true_values(self, list true_values): + """ + Sets additional values to recognize as boolean true values. + + Parameters + ---------- + true_values : list[str] + List of values to be considered to be true + + Returns + ------- + None + """ + cdef vector[string] vec + for val in true_values: + vec.push_back(val.encode()) + self.c_obj.set_true_values(vec) + + cpdef void set_false_values(self, list false_values): + """ + Sets additional values to recognize as boolean false values. + + Parameters + ---------- + false_values : list[str] + List of values to be considered to be false + + Returns + ------- + None + """ + cdef vector[string] vec + for val in false_values: + vec.push_back(val.encode()) + self.c_obj.set_false_values(vec) + + cpdef void set_na_values(self, list na_values): + """ + Sets additional values to recognize as null values. + + Parameters + ---------- + na_values : list[str] + List of values to be considered to be null + + Returns + ------- + None + """ + cdef vector[string] vec + for val in na_values: + vec.push_back(val.encode()) + self.c_obj.set_na_values(vec) + + +cdef class CsvReaderOptionsBuilder: + """ + Builder to build options for ``read_csv`` + + For details, see :cpp:class:`cudf::io::csv_reader_options_builder` + """ + cpdef CsvReaderOptionsBuilder compression(self, compression_type compression): + """ + Sets compression format of the source. + + Parameters + ---------- + compression : compression_type + Compression type + + Returns + ------- + CsvReaderOptionsBuilder + """ + self.c_obj.compression(compression) + return self + + cpdef CsvReaderOptionsBuilder mangle_dupe_cols(self, bool mangle_dupe_cols): + """ + Sets whether to rename duplicate column names. + + Parameters + ---------- + mangle_dupe_cols : bool + Boolean value to enable/disable + + Returns + ------- + CsvReaderOptionsBuilder + """ + self.c_obj.mangle_dupe_cols(mangle_dupe_cols) + return self + + cpdef CsvReaderOptionsBuilder byte_range_offset(self, size_t byte_range_offset): + """ + Sets number of bytes to skip from source start. + + Parameters + ---------- + byte_range_offset : size_t + Number of bytes of offset + + Returns + ------- + CsvReaderOptionsBuilder + """ + self.c_obj.byte_range_offset(byte_range_offset) + return self + + cpdef CsvReaderOptionsBuilder byte_range_size(self, size_t byte_range_size): + """ + Sets number of bytes to read. 
+
+        Parameters
+        ----------
+        byte_range_size : size_t
+            Number of bytes to read
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.byte_range_size(byte_range_size)
+        return self
+
+    cpdef CsvReaderOptionsBuilder nrows(self, size_type nrows):
+        """
+        Sets number of rows to read.
+
+        Parameters
+        ----------
+        nrows : size_type
+            Number of rows to read
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.nrows(nrows)
+        return self
+
+    cpdef CsvReaderOptionsBuilder skiprows(self, size_type skiprows):
+        """
+        Sets number of rows to skip from start.
+
+        Parameters
+        ----------
+        skiprows : size_type
+            Number of rows to skip
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.skiprows(skiprows)
+        return self
+
+    cpdef CsvReaderOptionsBuilder skipfooter(self, size_type skipfooter):
+        """
+        Sets number of rows to skip from end.
+
+        Parameters
+        ----------
+        skipfooter : size_type
+            Number of rows to skip
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.skipfooter(skipfooter)
+        return self
+
+    cpdef CsvReaderOptionsBuilder quoting(self, quote_style quoting):
+        """
+        Sets quoting style.
+
+        Parameters
+        ----------
+        quoting : quote_style
+            Quoting style used
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.quoting(quoting)
+        return self
+
+    cpdef CsvReaderOptionsBuilder lineterminator(self, str lineterminator):
+        """
+        Sets line terminator.
+
+        Parameters
+        ----------
+        lineterminator : str
+            A character to indicate line termination
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.lineterminator(ord(lineterminator))
+        return self
+
+    cpdef CsvReaderOptionsBuilder quotechar(self, str quotechar):
+        """
+        Sets quoting character.
+
+        Parameters
+        ----------
+        quotechar : str
+            A character to indicate quoting
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.quotechar(ord(quotechar))
+        return self
+
+    cpdef CsvReaderOptionsBuilder decimal(self, str decimal):
+        """
+        Sets decimal point character.
+
+        Parameters
+        ----------
+        decimal : str
+            A character that indicates decimal values
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.decimal(ord(decimal))
+        return self
+
+    cpdef CsvReaderOptionsBuilder delim_whitespace(self, bool delim_whitespace):
+        """
+        Sets whether to treat whitespace as field delimiter.
+
+        Parameters
+        ----------
+        delim_whitespace : bool
+            Boolean value to enable/disable
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.delim_whitespace(delim_whitespace)
+        return self
+
+    cpdef CsvReaderOptionsBuilder skipinitialspace(self, bool skipinitialspace):
+        """
+        Sets whether to skip whitespace after the delimiter.
+
+        Parameters
+        ----------
+        skipinitialspace : bool
+            Boolean value to enable/disable
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.skipinitialspace(skipinitialspace)
+        return self
+
+    cpdef CsvReaderOptionsBuilder skip_blank_lines(self, bool skip_blank_lines):
+        """
+        Sets whether to ignore empty lines or parse line values as invalid.
+
+        Parameters
+        ----------
+        skip_blank_lines : bool
+            Boolean value to enable/disable
+
+        Returns
+        -------
+        CsvReaderOptionsBuilder
+        """
+        self.c_obj.skip_blank_lines(skip_blank_lines)
+        return self
+
+    cpdef CsvReaderOptionsBuilder doublequote(self, bool doublequote):
+        """
+        Sets whether a quote inside a value is double-quoted.
+ + Parameters + ---------- + doublequote : bool + Boolean value to enable/disable + + Returns + ------- + CsvReaderOptionsBuilder + """ + self.c_obj.doublequote(doublequote) + return self + + cpdef CsvReaderOptionsBuilder keep_default_na(self, bool keep_default_na): + """ + Sets whether to keep the built-in default NA values. + + Parameters + ---------- + keep_default_na : bool + Boolean value to enable/disable + + Returns + ------- + CsvReaderOptionsBuilder + """ + self.c_obj.keep_default_na(keep_default_na) + return self + + cpdef CsvReaderOptionsBuilder na_filter(self, bool na_filter): + """ + Sets whether to disable null filter. + + Parameters + ---------- + na_filter : bool + Boolean value to enable/disable + + Returns + ------- + CsvReaderOptionsBuilder + """ + self.c_obj.na_filter(na_filter) + return self + + cpdef CsvReaderOptionsBuilder dayfirst(self, bool dayfirst): + """ + Sets whether to parse dates as DD/MM versus MM/DD. + + Parameters + ---------- + dayfirst : bool + Boolean value to enable/disable + + Returns + ------- + CsvReaderOptionsBuilder + """ + self.c_obj.dayfirst(dayfirst) + return self + + cpdef CsvReaderOptions build(self): + """Create a CsvReaderOptions object""" + cdef CsvReaderOptions csv_options = CsvReaderOptions.__new__( + CsvReaderOptions + ) + csv_options.c_obj = move(self.c_obj.build()) + csv_options.source = self.source + return csv_options + + +cpdef TableWithMetadata read_csv( + CsvReaderOptions options ): - """Reads a CSV file into a :py:class:`~.types.TableWithMetadata`. + """ + Read from CSV format. + + The source to read from and options are encapsulated + by the `options` object. For details, see :cpp:func:`read_csv`. Parameters ---------- - source_info : SourceInfo - The SourceInfo to read the CSV file from. - compression : compression_type, default CompressionType.AUTO - The compression format of the CSV source. - byte_range_offset : size_type, default 0 - Number of bytes to skip from source start. - byte_range_size : size_type, default 0 - Number of bytes to read. By default, will read all bytes. - col_names : list, default None - The column names to use. - prefix : string, default '' - The prefix to apply to the column names. - mangle_dupe_cols : bool, default True - If True, rename duplicate column names. - usecols : list, default None - Specify the string column names/integer column indices of columns to be read. - nrows : size_type, default -1 - The number of rows to read. - skiprows : size_type, default 0 - The number of rows to skip from the start before reading - skipfooter : size_type, default 0 - The number of rows to skip from the end - header : size_type, default 0 - The index of the row that will be used for header names. - Pass -1 to use default column names. - lineterminator : str, default '\\n' - The character used to determine the end of a line. - delimiter : str, default "," - The character used to separate fields in a row. - thousands : str, default None - The character used as the thousands separator. - Cannot match delimiter. - decimal : str, default '.' - The character used as the decimal separator. - Cannot match delimiter. - comment : str, default None - The character used to identify the start of a comment line. - (which will be skipped by the reader) - delim_whitespace : bool, default False - If True, treat whitespace as the field delimiter. - skipinitialspace : bool, default False - If True, skip whitespace after the delimiter. 
- skip_blank_lines : bool, default True - If True, ignore empty lines (otherwise line values are parsed as null). - quoting : QuoteStyle, default QuoteStyle.MINIMAL - The quoting style used in the input CSV data. One of - { QuoteStyle.MINIMAL, QuoteStyle.ALL, QuoteStyle.NONNUMERIC, QuoteStyle.NONE } - quotechar : str, default '"' - The character used to indicate quoting. - doublequote : bool, default True - If True, a quote inside a value is double-quoted. - parse_dates : list, default None - A list of integer column indices/string column names - of columns to read as datetime. - parse_hex : list, default None - A list of integer column indices/string column names - of columns to read as hexadecimal. - dtypes : Union[Dict[str, DataType], List[DataType]], default None - A list of data types or a dictionary mapping column names - to a DataType. - true_values : List[str], default None - A list of additional values to recognize as True. - false_values : List[str], default None - A list of additional values to recognize as False. - na_values : List[str], default None - A list of additional values to recognize as null. - keep_default_na : bool, default True - Whether to keep the built-in default N/A values. - na_filter : bool, default True - Whether to detect missing values. If False, can - improve performance. - dayfirst : bool, default False - If True, interpret dates as being in the DD/MM format. - - Returns - ------- - TableWithMetadata - The Table and its corresponding metadata (column names) that were read in. + options: CsvReaderOptions + Settings for controlling reading behavior """ - cdef vector[string] c_parse_dates_names - cdef vector[int] c_parse_dates_indexes - cdef vector[int] c_parse_hex_names - cdef vector[int] c_parse_hex_indexes - cdef vector[data_type] c_dtypes_list - cdef map[string, data_type] c_dtypes_map - - cdef csv_reader_options options = ( - csv_reader_options.builder(source_info.c_obj) - .compression(compression) - .mangle_dupe_cols(mangle_dupe_cols) - .byte_range_offset(byte_range_offset) - .byte_range_size(byte_range_size) - .nrows(nrows) - .skiprows(skiprows) - .skipfooter(skipfooter) - .quoting(quoting) - .lineterminator(ord(lineterminator)) - .quotechar(ord(quotechar)) - .decimal(ord(decimal)) - .delim_whitespace(delim_whitespace) - .skipinitialspace(skipinitialspace) - .skip_blank_lines(skip_blank_lines) - .doublequote(doublequote) - .keep_default_na(keep_default_na) - .na_filter(na_filter) - .dayfirst(dayfirst) - .build() - ) - - options.set_header(header) - - if col_names is not None: - options.set_names([str(name).encode() for name in col_names]) - - if prefix is not None: - options.set_prefix(prefix.encode()) - - if usecols is not None: - if all([isinstance(col, int) for col in usecols]): - options.set_use_cols_indexes(list(usecols)) - else: - options.set_use_cols_names([str(name).encode() for name in usecols]) - - if delimiter is not None: - options.set_delimiter(ord(delimiter)) - - if thousands is not None: - options.set_thousands(ord(thousands)) - - if comment is not None: - options.set_comment(ord(comment)) - - if parse_dates is not None: - if not all([isinstance(col, (str, int)) for col in parse_dates]): - raise NotImplementedError( - "`parse_dates`: Must pass a list of column names/indices") - - # Set both since users are allowed to mix column names and indices - c_parse_dates_names, c_parse_dates_indexes = \ - _process_parse_dates_hex(parse_dates) - options.set_parse_dates(c_parse_dates_names) - options.set_parse_dates(c_parse_dates_indexes) - - if 
parse_hex is not None: - if not all([isinstance(col, (str, int)) for col in parse_hex]): - raise NotImplementedError( - "`parse_hex`: Must pass a list of column names/indices") - - # Set both since users are allowed to mix column names and indices - c_parse_hex_names, c_parse_hex_indexes = _process_parse_dates_hex(parse_hex) - options.set_parse_hex(c_parse_hex_names) - options.set_parse_hex(c_parse_hex_indexes) - - if isinstance(dtypes, list): - for dtype in dtypes: - c_dtypes_list.push_back((dtype).c_obj) - options.set_dtypes(c_dtypes_list) - elif isinstance(dtypes, dict): - # dtypes_t is dict - for k, v in dtypes.items(): - c_dtypes_map[str(k).encode()] = (v).c_obj - options.set_dtypes(c_dtypes_map) - elif dtypes is not None: - raise TypeError("dtypes must either by a list/dict") - - if true_values is not None: - options.set_true_values(_make_str_vector(true_values)) - - if false_values is not None: - options.set_false_values(_make_str_vector(false_values)) - - if na_values is not None: - options.set_na_values(_make_str_vector(na_values)) - cdef table_with_metadata c_result with nogil: - c_result = move(cpp_read_csv(options)) + c_result = move(cpp_read_csv(options.c_obj)) - return TableWithMetadata.from_libcudf(c_result) + cdef TableWithMetadata tbl_meta = TableWithMetadata.from_libcudf(c_result) + return tbl_meta # TODO: Implement the remaining methods diff --git a/python/pylibcudf/pylibcudf/tests/io/test_csv.py b/python/pylibcudf/pylibcudf/tests/io/test_csv.py index 90d2d0896a5..1cbaac57315 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_csv.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_csv.py @@ -77,14 +77,16 @@ def test_read_csv_basic( offset=skiprows, length=nrows if nrows != -1 else None ) - res = plc.io.csv.read_csv( - plc.io.SourceInfo([source]), - delimiter=delimiter, - compression=compression_type, - col_names=column_names, - nrows=nrows, - skiprows=skiprows, + options = ( + plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([source])) + .compression(compression_type) + .nrows(nrows) + .skiprows(skiprows) + .build() ) + options.set_delimiter(delimiter) + options.set_names([str(name) for name in column_names]) + res = plc.io.csv.read_csv(options) assert_table_and_meta_eq( pa_table, @@ -110,15 +112,15 @@ def test_read_csv_byte_range(table_data, chunk_size, tmp_path): file_size = os.stat(source).st_size tbls_w_meta = [] for segment in range((file_size + chunk_size - 1) // chunk_size): - tbls_w_meta.append( - plc.io.csv.read_csv( - plc.io.SourceInfo([source]), - byte_range_offset=segment * chunk_size, - byte_range_size=chunk_size, - header=-1, - col_names=pa_table.column_names, - ) + options = ( + plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([source])) + .byte_range_offset(segment * chunk_size) + .byte_range_size(chunk_size) + .build() ) + options.set_header(-1) + options.set_names([str(name) for name in pa_table.column_names]) + tbls_w_meta.append(plc.io.csv.read_csv(options)) if isinstance(source, io.IOBase): source.seek(0) exp = pd.read_csv(source, names=pa_table.column_names, header=None) @@ -161,9 +163,16 @@ def test_read_csv_dtypes(csv_table_data, source_or_sink, usecols): new_schema = pa.schema(new_fields) - res = plc.io.csv.read_csv( - plc.io.SourceInfo([source]), dtypes=dtypes, usecols=usecols - ) + options = plc.io.csv.CsvReaderOptions.builder( + plc.io.SourceInfo([source]) + ).build() + options.set_dtypes(dtypes) + if usecols is not None: + if all(isinstance(col, int) for col in usecols): + options.set_use_cols_indexes(list(usecols)) + else: + 
options.set_use_cols_names([str(name) for name in usecols]) + res = plc.io.csv.read_csv(options) new_table = pa_table.cast(new_schema) assert_table_and_meta_eq(new_table, res) @@ -171,7 +180,7 @@ def test_read_csv_dtypes(csv_table_data, source_or_sink, usecols): @pytest.mark.parametrize("skip_blanks", [True, False]) @pytest.mark.parametrize("decimal, quotechar", [(".", "'"), ("_", '"')]) -@pytest.mark.parametrize("lineterminator", ["\n", "\r\n"]) +@pytest.mark.parametrize("lineterminator", ["\n", "\t"]) def test_read_csv_parse_options( source_or_sink, decimal, quotechar, skip_blanks, lineterminator ): @@ -188,19 +197,25 @@ def test_read_csv_parse_options( write_source_str(source_or_sink, buffer) - plc_table_w_meta = plc.io.csv.read_csv( - plc.io.SourceInfo([source_or_sink]), - comment="#", - decimal=decimal, - skip_blank_lines=skip_blanks, - quotechar=quotechar, + options = ( + plc.io.csv.CsvReaderOptions.builder( + plc.io.SourceInfo([source_or_sink]) + ) + .lineterminator(lineterminator) + .quotechar(quotechar) + .decimal(decimal) + .skip_blank_lines(skip_blanks) + .build() ) + options.set_comment("#") + plc_table_w_meta = plc.io.csv.read_csv(options) df = pd.read_csv( StringIO(buffer), comment="#", decimal=decimal, skip_blank_lines=skip_blanks, quotechar=quotechar, + lineterminator=lineterminator, ) assert_table_and_meta_eq(pa.Table.from_pandas(df), plc_table_w_meta) @@ -216,12 +231,17 @@ def test_read_csv_na_values( write_source_str(source_or_sink, buffer) - plc_table_w_meta = plc.io.csv.read_csv( - plc.io.SourceInfo([source_or_sink]), - na_filter=na_filter, - na_values=na_values if na_filter else None, - keep_default_na=keep_default_na, + options = ( + plc.io.csv.CsvReaderOptions.builder( + plc.io.SourceInfo([source_or_sink]) + ) + .keep_default_na(keep_default_na) + .na_filter(na_filter) + .build() ) + if na_filter and na_values is not None: + options.set_na_values(na_values) + plc_table_w_meta = plc.io.csv.read_csv(options) df = pd.read_csv( StringIO(buffer), na_filter=na_filter, @@ -241,9 +261,11 @@ def test_read_csv_header(csv_table_data, source_or_sink, header): **_COMMON_CSV_SOURCE_KWARGS, ) - plc_table_w_meta = plc.io.csv.read_csv( - plc.io.SourceInfo([source]), header=header - ) + options = plc.io.csv.CsvReaderOptions.builder( + plc.io.SourceInfo([source]) + ).build() + options.set_header(header) + plc_table_w_meta = plc.io.csv.read_csv(options) if header > 0: if header < len(pa_table): names_row = pa_table.take([header - 1]).to_pylist()[0].values()
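
Usage sketch (not part of the patch): the snippet below shows how the new reader options flow fits together, mirroring the call sites changed above. The file name example.csv, the row count, and the column names are illustrative only; all of the calls themselves (builder, build, the set_* methods, read_csv) are the ones introduced in this diff.

import pylibcudf as plc

# Chain scalar options on the builder, then apply list/str-valued settings
# through the set_* methods exposed on the built CsvReaderOptions object.
options = (
    plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo(["example.csv"]))
    .nrows(100)
    .build()
)
options.set_delimiter(",")
options.set_names(["a", "b"])  # illustrative column names

tbl_w_meta = plc.io.csv.read_csv(options)
print(tbl_w_meta.tbl.num_rows())

This is the same pattern the updated cudf and cudf-polars call sites follow: options that take a single scalar are set while building, while list- and string-valued settings are applied after build() via the setters on CsvReaderOptions.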