From 89945b6b50508809815381c9930f01c4516c55ea Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Thu, 21 Nov 2024 15:53:03 -0800 Subject: [PATCH 01/11] declare csv reader classes --- python/pylibcudf/pylibcudf/io/csv.pxd | 39 +++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/python/pylibcudf/pylibcudf/io/csv.pxd b/python/pylibcudf/pylibcudf/io/csv.pxd index f04edaa316a..45a06207464 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pxd +++ b/python/pylibcudf/pylibcudf/io/csv.pxd @@ -6,11 +6,46 @@ from libcpp cimport bool from pylibcudf.libcudf.io.csv cimport ( csv_writer_options, csv_writer_options_builder, + csv_reader_options, + csv_reader_options_builder, ) -from pylibcudf.libcudf.io.types cimport quote_style -from pylibcudf.io.types cimport SinkInfo +from pylibcudf.io.types cimport SinkInfo, SourceInfo from pylibcudf.table cimport Table +from pylibcudf.libcudf.io.types cimport ( + compression_type, + quote_style, + table_with_metadata, +) +from pylibcudf.libcudf.types cimport size_type + +cdef class CsvReaderOptions: + cdef csv_reader_options c_obj + cdef SourceInfo source + +cdef class CsvReaderOptionsBuilder: + cdef csv_reader_options_builder c_obj + cdef SourceInfo source + cdef CsvReaderOptionsBuilder compression(self, compression_type compression) + cdef CsvReaderOptionsBuilder mangle_dupe_cols(self, bool mangle_dupe_cols) + cdef CsvReaderOptionsBuilder byte_range_offset(self, size_t byte_range_offset) + cdef CsvReaderOptionsBuilder byte_range_size(self, size_t byte_range_size) + cdef CsvReaderOptionsBuilder nrows(self, size_type nrows) + cdef CsvReaderOptionsBuilder skiprows(self, size_type skiprows) + cdef CsvReaderOptionsBuilder skipfooter(self, size_type skipfooter) + cdef CsvReaderOptionsBuilder quoting(self, quote_style quoting) + cdef CsvReaderOptionsBuilder lineterminator(self, str lineterminator) + cdef CsvReaderOptionsBuilder quotechar(self, str quotechar) + cdef CsvReaderOptionsBuilder decimal(self, str 
decimal) + cdef CsvReaderOptionsBuilder delim_whitespace(self, bool delim_whitespace) + cdef CsvReaderOptionsBuilder skipinitialspace(self, bool skipinitialspace) + cdef CsvReaderOptionsBuilder skip_blank_lines(self, bool skip_blank_lines) + cdef CsvReaderOptionsBuilder doublequote(self, bool doublequote) + cdef CsvReaderOptionsBuilder keep_default_na(self, bool keep_default_na) + cdef CsvReaderOptionsBuilder na_filter(self, bool na_filter) + cdef CsvReaderOptionsBuilder dayfirst(self, bool dayfirst) + cdef CsvReaderOptionsBuilder build(self) + cdef class CsvWriterOptions: cdef csv_writer_options c_obj cdef Table table From 96e1d44db389e51b76d47e3379c6d1a29056b057 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Thu, 21 Nov 2024 18:56:19 -0800 Subject: [PATCH 02/11] add type stubs and empty impl --- python/pylibcudf/pylibcudf/io/csv.pxd | 57 ++++++++----- python/pylibcudf/pylibcudf/io/csv.pyi | 62 +++++++++++--- python/pylibcudf/pylibcudf/io/csv.pyx | 113 ++++++++++++++++++++++++++ 3 files changed, 202 insertions(+), 30 deletions(-) diff --git a/python/pylibcudf/pylibcudf/io/csv.pxd b/python/pylibcudf/pylibcudf/io/csv.pxd index 45a06207464..7387fb941f0 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pxd +++ b/python/pylibcudf/pylibcudf/io/csv.pxd @@ -19,32 +19,51 @@ from pylibcudf.libcudf.io.types cimport ( ) from pylibcudf.libcudf.types cimport size_type +ctypedef fused DictOrList: + dict + list + cdef class CsvReaderOptions: cdef csv_reader_options c_obj cdef SourceInfo source + cpdef void set_header(size_type header) + cpdef void set_names(list col_names) + cpdef void set_prefix(str prefix) + cpdef void set_use_cols_indexes(list col_indices) + cpdef void set_use_cols_names(list col_names) + cpdef void set_delimiter(str delimiter) + cpdef void set_thousands(str thousands) + cpdef void set_comment(str comment) + cpdef void set_parse_dates(list val) + cpdef void set_parse_hex(list val) + cpdef void set_dtypes(DictOrList types) + cpdef void set_true_values(list 
true_values) + cpdef void set_false_values(list false_values) + cpdef void set_na_values(list na_values) + cdef class CsvReaderOptionsBuilder: cdef csv_reader_options_builder c_obj cdef SourceInfo source - cdef CsvReaderOptionsBuilder compression(self, compression_type compression) - cdef CsvReaderOptionsBuilder mangle_dupe_cols(self, bool mangle_dupe_cols) - cdef CsvReaderOptionsBuilder byte_range_offset(self, size_t byte_range_offset) - cdef CsvReaderOptionsBuilder byte_range_size(self, size_t byte_range_size) - cdef CsvReaderOptionsBuilder nrows(self, size_type nrows) - cdef CsvReaderOptionsBuilder skiprows(self, size_type skiprows) - cdef CsvReaderOptionsBuilder skipfooter(self, size_type skipfooter) - cdef CsvReaderOptionsBuilder quoting(self, quote_style quoting) - cdef CsvReaderOptionsBuilder lineterminator(self, str lineterminator) - cdef CsvReaderOptionsBuilder quotechar(self, str quotechar) - cdef CsvReaderOptionsBuilder decimal(self, str decimal) - cdef CsvReaderOptionsBuilder delim_whitespace(self, bool delim_whitespace) - cdef CsvReaderOptionsBuilder skipinitialspace(self, bool skipinitialspace) - cdef CsvReaderOptionsBuilder skip_blank_lines(self, bool skip_blank_lines) - cdef CsvReaderOptionsBuilder doublequote(self, bool doublequote) - cdef CsvReaderOptionsBuilder keep_default_na(self, bool keep_default_na) - cdef CsvReaderOptionsBuilder na_filter(self, bool na_filter) - cdef CsvReaderOptionsBuilder dayfirst(self, bool dayfirst) - cdef CsvReaderOptionsBuilder build(self) + cpdef CsvReaderOptionsBuilder compression(self, compression_type compression) + cpdef CsvReaderOptionsBuilder mangle_dupe_cols(self, bool mangle_dupe_cols) + cpdef CsvReaderOptionsBuilder byte_range_offset(self, size_t byte_range_offset) + cpdef CsvReaderOptionsBuilder byte_range_size(self, size_t byte_range_size) + cpdef CsvReaderOptionsBuilder nrows(self, size_type nrows) + cpdef CsvReaderOptionsBuilder skiprows(self, size_type skiprows) + cpdef CsvReaderOptionsBuilder 
skipfooter(self, size_type skipfooter) + cpdef CsvReaderOptionsBuilder quoting(self, quote_style quoting) + cpdef CsvReaderOptionsBuilder lineterminator(self, str lineterminator) + cpdef CsvReaderOptionsBuilder quotechar(self, str quotechar) + cpdef CsvReaderOptionsBuilder decimal(self, str decimal) + cpdef CsvReaderOptionsBuilder delim_whitespace(self, bool delim_whitespace) + cpdef CsvReaderOptionsBuilder skipinitialspace(self, bool skipinitialspace) + cpdef CsvReaderOptionsBuilder skip_blank_lines(self, bool skip_blank_lines) + cpdef CsvReaderOptionsBuilder doublequote(self, bool doublequote) + cpdef CsvReaderOptionsBuilder keep_default_na(self, bool keep_default_na) + cpdef CsvReaderOptionsBuilder na_filter(self, bool na_filter) + cpdef CsvReaderOptionsBuilder dayfirst(self, bool dayfirst) + cpdef CsvReaderOptions build(self) cdef class CsvWriterOptions: cdef csv_writer_options c_obj diff --git a/python/pylibcudf/pylibcudf/io/csv.pyi b/python/pylibcudf/pylibcudf/io/csv.pyi index 583b66bc29c..adfd29eb530 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pyi +++ b/python/pylibcudf/pylibcudf/io/csv.pyi @@ -1,6 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from collections.abc import Mapping +from typing import Self from pylibcudf.io.types import ( CompressionType, @@ -12,6 +13,47 @@ from pylibcudf.io.types import ( from pylibcudf.table import Table from pylibcudf.types import DataType +class CsvReaderOptions: + def __init__(self): ... + def set_header(self, header: int): ... + def set_names(self, col_names: list[str]): ... + def set_prefix(self, prefix: str): ... + def set_use_cols_indexes(self, col_indices: list[int]): ... + def set_use_cols_names(self, col_names: list[str]): ... + def set_delimiter(self, delimiter: str): ... + def set_thousands(self, thousands: str): ... + def set_comment(self, comment: str): ... + def set_parse_dates(self, val: list[int | str]): ... + def set_parse_hex(self, val: list[int | str]): ... 
+ def set_dtypes(self, types: dict[str, DataType] | list[DataType]): ... + def set_true_values(self, true_values: list[str]): ... + def set_false_values(self, false_values: list[str]): ... + def set_na_values(self, na_values: list[str]): ... + @staticmethod + def builder(source: SourceInfo) -> CsvReaderOptionsBuilder: ... + +class CsvReaderOptionsBuilder: + def __init__(self): ... + def compression(self, compression: CompressionType) -> Self: ... + def mangle_dupe_cols(self, mangle_dupe_cols: bool) -> Self: ... + def byte_range_offset(self, byte_range_offset: int) -> Self: ... + def byte_range_size(self, byte_range_size: int) -> Self: ... + def nrows(self, nrows: int) -> Self: ... + def skiprows(self, skiprows: int) -> Self: ... + def skipfooter(self, skipfooter: int) -> Self: ... + def quoting(self, quoting: QuoteStyle) -> Self: ... + def lineterminator(self, lineterminator: str) -> Self: ... + def quotechar(self, quotechar: str) -> Self: ... + def decimal(self, decimal: str) -> Self: ... + def delim_whitespace(self, delim_whitespace: bool) -> Self: ... + def skipinitialspace(self, skipinitialspace: bool) -> Self: ... + def skip_blank_lines(self, skip_blank_lines: bool) -> Self: ... + def doublequote(self, doublequote: bool) -> Self: ... + def keep_default_na(self, keep_default_na: bool) -> Self: ... + def na_filter(self, na_filter: bool) -> Self: ... + def dayfirst(self, dayfirst: bool) -> Self: ... + def build(self) -> CsvReaderOptions: ... + def read_csv( source_info: SourceInfo, *, @@ -54,7 +96,7 @@ def read_csv( # detect_whitespace_around_quotes: bool = False, # timestamp_type: DataType = DataType(type_id.EMPTY), ) -> TableWithMetadata: ... -def write_csv(options: CsvWriterOptionsBuilder) -> None: ... +def write_csv(options: CsvWriterOptionsBuilder): ... class CsvWriterOptions: def __init__(self): ... @@ -63,14 +105,12 @@ class CsvWriterOptions: class CsvWriterOptionsBuilder: def __init__(self): ... 
- def names(self, names: list) -> CsvWriterOptionsBuilder: ... - def na_rep(self, val: str) -> CsvWriterOptionsBuilder: ... - def include_header(self, val: bool) -> CsvWriterOptionsBuilder: ... - def rows_per_chunk(self, val: int) -> CsvWriterOptionsBuilder: ... - def line_terminator(self, term: str) -> CsvWriterOptionsBuilder: ... - def inter_column_delimiter( - self, delim: str - ) -> CsvWriterOptionsBuilder: ... - def true_value(self, val: str) -> CsvWriterOptionsBuilder: ... - def false_value(self, val: str) -> CsvWriterOptionsBuilder: ... + def names(self, names: list) -> Self: ... + def na_rep(self, val: str) -> Self: ... + def include_header(self, val: bool) -> Self: ... + def rows_per_chunk(self, val: int) -> Self: ... + def line_terminator(self, term: str) -> Self: ... + def inter_column_delimiter(self, delim: str) -> Self: ... + def true_value(self, val: str) -> Self: ... + def false_value(self, val: str) -> Self: ... def build(self) -> CsvWriterOptions: ... diff --git a/python/pylibcudf/pylibcudf/io/csv.pyx b/python/pylibcudf/pylibcudf/io/csv.pyx index 8be391de2c2..f9ba69462ba 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pyx +++ b/python/pylibcudf/pylibcudf/io/csv.pyx @@ -28,6 +28,8 @@ __all__ = [ "write_csv", "CsvWriterOptions", "CsvWriterOptionsBuilder", + "CsvReaderOptions", + "CsvReaderOptionsBuilder", ] cdef tuple _process_parse_dates_hex(list cols): @@ -47,6 +49,117 @@ cdef vector[string] _make_str_vector(list vals): return res +cdef class CsvReaderOptions: + @staticmethod + def builder(SourceInfo source): + cdef CsvReaderOptionsBuilder csv_builder = CsvReaderOptionsBuilder.__new__( + CsvReaderOptionsBuilder + ) + csv_builder.c_obj = csv_writer_options.builder(source.c_obj) + csv_builder.source = source + + cpdef void set_header(size_type header): + self.c_obj.set_header(header) + + cpdef void set_names(list col_names): + pass + + cpdef void set_prefix(str prefix): + pass + + cpdef void set_use_cols_indexes(list col_indices): + pass + + cpdef 
void set_use_cols_names(list col_names): + pass + + cpdef void set_delimiter(str delimiter): + pass + + cpdef void set_thousands(str thousands): + pass + + cpdef void set_comment(str comment): + pass + + cpdef void set_parse_dates(list val): + pass + + cpdef void set_parse_hex(list val): + pass + + cpdef void set_dtypes(DictOrList types): + pass + + cpdef void set_true_values(list true_values): + pass + + cpdef void set_false_values(list false_values): + pass + + cpdef void set_na_values(list na_values): + pass + + +cdef class CsvReaderOptionsBuilder: + cdef CsvReaderOptionsBuilder compression(self, compression_type compression): + return self.c_obj.compression() + + cdef CsvReaderOptionsBuilder mangle_dupe_cols(self, bool mangle_dupe_cols): + pass + + cdef CsvReaderOptionsBuilder byte_range_offset(self, size_t byte_range_offset): + pass + + cdef CsvReaderOptionsBuilder byte_range_size(self, size_t byte_range_size): + pass + + cdef CsvReaderOptionsBuilder nrows(self, size_type nrows): + pass + + cdef CsvReaderOptionsBuilder skiprows(self, size_type skiprows): + pass + + cdef CsvReaderOptionsBuilder skipfooter(self, size_type skipfooter): + pass + + cdef CsvReaderOptionsBuilder quoting(self, quote_style quoting): + pass + + cdef CsvReaderOptionsBuilder lineterminator(self, str lineterminator): + pass + + cdef CsvReaderOptionsBuilder quotechar(self, str quotechar): + pass + + cdef CsvReaderOptionsBuilder decimal(self, str decimal): + pass + + cdef CsvReaderOptionsBuilder delim_whitespace(self, bool delim_whitespace): + pass + + cdef CsvReaderOptionsBuilder skipinitialspace(self, bool skipinitialspace): + pass + + cdef CsvReaderOptionsBuilder skip_blank_lines(self, bool skip_blank_lines): + pass + + cdef CsvReaderOptionsBuilder doublequote(self, bool doublequote): + pass + + cdef CsvReaderOptionsBuilder keep_default_na(self, bool keep_default_na): + pass + + cdef CsvReaderOptionsBuilder na_filter(self, bool na_filter): + pass + + cdef CsvReaderOptionsBuilder 
dayfirst(self, bool dayfirst): + pass + + cdef CsvReaderOptions build(self): + pass + + def read_csv( SourceInfo source_info, *, From 39a22fcd458cb57fc7ea052c19865c65d1496666 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Fri, 22 Nov 2024 07:39:12 -0800 Subject: [PATCH 03/11] implement --- python/pylibcudf/pylibcudf/io/csv.pxd | 32 ++-- python/pylibcudf/pylibcudf/io/csv.pyx | 241 +++++++++++++++++--------- 2 files changed, 177 insertions(+), 96 deletions(-) diff --git a/python/pylibcudf/pylibcudf/io/csv.pxd b/python/pylibcudf/pylibcudf/io/csv.pxd index 7387fb941f0..8515aa23b53 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pxd +++ b/python/pylibcudf/pylibcudf/io/csv.pxd @@ -19,27 +19,23 @@ from pylibcudf.libcudf.io.types cimport ( ) from pylibcudf.libcudf.types cimport size_type -ctypedef fused DictOrList: - dict - list - cdef class CsvReaderOptions: cdef csv_reader_options c_obj cdef SourceInfo source - cpdef void set_header(size_type header) - cpdef void set_names(list col_names) - cpdef void set_prefix(str prefix) - cpdef void set_use_cols_indexes(list col_indices) - cpdef void set_use_cols_names(list col_names) - cpdef void set_delimiter(str delimiter) - cpdef void set_thousands(str thousands) - cpdef void set_comment(str comment) - cpdef void set_parse_dates(list val) - cpdef void set_parse_hex(list val) - cpdef void set_dtypes(DictOrList types) - cpdef void set_true_values(list true_values) - cpdef void set_false_values(list false_values) - cpdef void set_na_values(list na_values) + cpdef void set_header(self, size_type header) + cpdef void set_names(self, list col_names) + cpdef void set_prefix(self, str prefix) + cpdef void set_use_cols_indexes(self, list col_indices) + cpdef void set_use_cols_names(self, list col_names) + cpdef void set_delimiter(self, str delimiter) + cpdef void set_thousands(self, str thousands) + cpdef void set_comment(self, str comment) + cpdef void set_parse_dates(self, list val) + cpdef void set_parse_hex(self, list val) + cpdef 
void set_dtypes(self, object types) + cpdef void set_true_values(self, list true_values) + cpdef void set_false_values(self, list false_values) + cpdef void set_na_values(self, list na_values) cdef class CsvReaderOptionsBuilder: diff --git a/python/pylibcudf/pylibcudf/io/csv.pyx b/python/pylibcudf/pylibcudf/io/csv.pyx index f9ba69462ba..7c060de84ed 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pyx +++ b/python/pylibcudf/pylibcudf/io/csv.pyx @@ -55,109 +55,194 @@ cdef class CsvReaderOptions: cdef CsvReaderOptionsBuilder csv_builder = CsvReaderOptionsBuilder.__new__( CsvReaderOptionsBuilder ) - csv_builder.c_obj = csv_writer_options.builder(source.c_obj) + csv_builder.c_obj = csv_reader_options.builder(source.c_obj) csv_builder.source = source - cpdef void set_header(size_type header): + cpdef void set_header(self, size_type header): self.c_obj.set_header(header) - cpdef void set_names(list col_names): - pass - - cpdef void set_prefix(str prefix): - pass - - cpdef void set_use_cols_indexes(list col_indices): - pass - - cpdef void set_use_cols_names(list col_names): - pass - - cpdef void set_delimiter(str delimiter): - pass - - cpdef void set_thousands(str thousands): - pass - - cpdef void set_comment(str comment): - pass - - cpdef void set_parse_dates(list val): - pass - - cpdef void set_parse_hex(list val): - pass - - cpdef void set_dtypes(DictOrList types): - pass - - cpdef void set_true_values(list true_values): - pass - - cpdef void set_false_values(list false_values): - pass - - cpdef void set_na_values(list na_values): - pass + cpdef void set_names(self, list col_names): + cdef vector[string] vec + vec.reserve(len(col_names)) + for name in col_names: + vec.push_back(name.encode()) + self.c_obj.set_names(vec) + + cpdef void set_prefix(self, str prefix): + self.c_obj.set_prefix(prefix.encode()) + + cpdef void set_use_cols_indexes(self, list col_indices): + cdef vector[int] vec + vec.reserve(len(col_indices)) + for i in col_indices: + vec.push_back(i) + 
self.c_obj.set_use_cols_indexes(vec) + + cpdef void set_use_cols_names(self, list col_names): + cdef vector[string] vec + vec.reserve(len(col_names)) + for name in col_names: + vec.push_back(name.encode()) + self.c_obj.set_use_cols_names(vec) + + cpdef void set_delimiter(self, str delimiter): + self.c_obj.set_delimiter(ord(delimiter)) + + cpdef void set_thousands(self, str thousands): + self.c_obj.set_thousands(ord(thousands)) + + cpdef void set_comment(self, str comment): + self.c_obj.set_comment(ord(comment)) + + cpdef void set_parse_dates(self, list val): + cdef vector[string] vec_str + cdef vector[int] vec_int + if all([isinstance(date, str) for date in val]): + vec_str.reserve(len(val)) + for date in val: + vec_str.push_back(date.encode()) + self.c_obj.set_parse_dates(vec_str) + elif all([isinstance(date, int) for date in val]): + vec_int.reserve(len(val)) + for date in val: + vec_int.push_back(date) + self.c_obj.set_parse_dates(vec_int) + else: + raise TypeError("Must pass an int or str") + + cpdef void set_parse_hex(self, list[IntOrStr] val): + cdef vector[string] vec_str + cdef vector[int] vec_int + if all([isinstance(hx, str) for hx in val]): + vec_str.reserve(len(val)) + for hx in val: + vec_str.push_back(hx.encode()) + self.c_obj.set_parse_dates(vec_str) + elif all([isinstance(hx, int) for hx in val]): + vec_int.reserve(len(val)) + for hx in val: + vec_int.push_back(hx) + self.c_obj.set_parse_dates(vec_int) + else: + raise TypeError("Must pass an int or str") + + cpdef void set_dtypes(self, object types): + cdef map[string, data_type] dtype_map + cdef vector[data_type] dtype_list + if isinstance(types, dict): + for name, dtype in types.items(): + dtype_map[name.encode()] = (dtype).c_obj + self.c_obj.set_dtypes(dtype_map) + elif isinstance(types, list): + dtype_list.reserve(len(types)) + for dtype in types: + dtype_list.push_back((dtype).c_obj) + self.c_obj.set_dtypes(dtype_list) + else: + raise TypeError("Must pass an dict or list") + + cpdef void 
set_true_values(self, list true_values): + cdef vector[string] vec + vec.reserve(len(true_values)) + for val in true_values: + vec.push_back(val.encode()) + self.c_obj.set_true_values(vec) + + cpdef void set_false_values(self, list false_values): + cdef vector[string] vec + vec.reserve(len(false_values)) + for val in false_values: + vec.push_back(val.encode()) + self.c_obj.set_false_values(vec) + + cpdef void set_na_values(self, list na_values): + cdef vector[string] vec + vec.reserve(len(na_values)) + for val in na_values: + vec.push_back(val.encode()) + self.c_obj.set_na_values(vec) cdef class CsvReaderOptionsBuilder: - cdef CsvReaderOptionsBuilder compression(self, compression_type compression): - return self.c_obj.compression() + cpdef CsvReaderOptionsBuilder compression(self, compression_type compression): + self.c_obj.compression(compression) + return self - cdef CsvReaderOptionsBuilder mangle_dupe_cols(self, bool mangle_dupe_cols): - pass + cpdef CsvReaderOptionsBuilder mangle_dupe_cols(self, bool mangle_dupe_cols): + self.c_obj.mangle_dupe_cols(mangle_dupe_cols) + return self - cdef CsvReaderOptionsBuilder byte_range_offset(self, size_t byte_range_offset): - pass + cpdef CsvReaderOptionsBuilder byte_range_offset(self, size_t byte_range_offset): + self.c_obj.byte_range_offset(byte_range_offset) + return self - cdef CsvReaderOptionsBuilder byte_range_size(self, size_t byte_range_size): - pass + cpdef CsvReaderOptionsBuilder byte_range_size(self, size_t byte_range_size): + self.c_obj.byte_range_size(byte_range_size) + return self - cdef CsvReaderOptionsBuilder nrows(self, size_type nrows): - pass + cpdef CsvReaderOptionsBuilder nrows(self, size_type nrows): + self.c_obj.nrows(nrows) + return self - cdef CsvReaderOptionsBuilder skiprows(self, size_type skiprows): - pass + cpdef CsvReaderOptionsBuilder skiprows(self, size_type skiprows): + self.c_obj.skiprows(skiprows) + return self - cdef CsvReaderOptionsBuilder skipfooter(self, size_type skipfooter): - pass + 
cpdef CsvReaderOptionsBuilder skipfooter(self, size_type skipfooter): + self.c_obj.skipfooter(skipfooter) + return self - cdef CsvReaderOptionsBuilder quoting(self, quote_style quoting): - pass + cpdef CsvReaderOptionsBuilder quoting(self, quote_style quoting): + self.c_obj.quoting(quoting) + return self - cdef CsvReaderOptionsBuilder lineterminator(self, str lineterminator): - pass + cpdef CsvReaderOptionsBuilder lineterminator(self, str lineterminator): + self.c_obj.lineterminator(ord(lineterminator)) + return self - cdef CsvReaderOptionsBuilder quotechar(self, str quotechar): - pass + cpdef CsvReaderOptionsBuilder quotechar(self, str quotechar): + self.c_obj.quotechar(ord(quotechar)) + return self - cdef CsvReaderOptionsBuilder decimal(self, str decimal): - pass + cpdef CsvReaderOptionsBuilder decimal(self, str decimal): + self.c_obj.decimal(ord(decimal)) + return self - cdef CsvReaderOptionsBuilder delim_whitespace(self, bool delim_whitespace): - pass + cpdef CsvReaderOptionsBuilder delim_whitespace(self, bool delim_whitespace): + self.c_obj.delim_whitespace(delim_whitespace) + return self - cdef CsvReaderOptionsBuilder skipinitialspace(self, bool skipinitialspace): - pass + cpdef CsvReaderOptionsBuilder skipinitialspace(self, bool skipinitialspace): + self.c_obj.skipinitialspace(skipinitialspace) + return self - cdef CsvReaderOptionsBuilder skip_blank_lines(self, bool skip_blank_lines): - pass + cpdef CsvReaderOptionsBuilder skip_blank_lines(self, bool skip_blank_lines): + self.c_obj.skip_blank_lines(skip_blank_lines) + return self - cdef CsvReaderOptionsBuilder doublequote(self, bool doublequote): - pass + cpdef CsvReaderOptionsBuilder doublequote(self, bool doublequote): + self.c_obj.doublequote(doublequote) + return self - cdef CsvReaderOptionsBuilder keep_default_na(self, bool keep_default_na): - pass + cpdef CsvReaderOptionsBuilder keep_default_na(self, bool keep_default_na): + self.c_obj.keep_default_na(keep_default_na) + return self - cdef 
CsvReaderOptionsBuilder na_filter(self, bool na_filter): - pass + cpdef CsvReaderOptionsBuilder na_filter(self, bool na_filter): + self.c_obj.na_filter(na_filter) + return self - cdef CsvReaderOptionsBuilder dayfirst(self, bool dayfirst): - pass + cpdef CsvReaderOptionsBuilder dayfirst(self, bool dayfirst): + self.c_obj.dayfirst(dayfirst) + return self - cdef CsvReaderOptions build(self): - pass + cpdef CsvReaderOptions build(self): + cdef CsvReaderOptions csv_options = CsvReaderOptions.__new__( + CsvReaderOptions + ) + csv_options.c_obj = move(self.c_obj.build()) + csv_options.source = self.source + return csv_options def read_csv( From 9a2462b3e74a2e54de293167f91e3b97156e1034 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Fri, 22 Nov 2024 15:46:02 -0800 Subject: [PATCH 04/11] plumn changes through cudf python --- python/cudf/cudf/_lib/csv.pyx | 145 +++++++++---- python/cudf/cudf/tests/test_csv.py | 7 +- python/pylibcudf/pylibcudf/io/csv.pyx | 297 +++++++++++++------------- 3 files changed, 258 insertions(+), 191 deletions(-) diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 59a970263e0..aecedcb5081 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -164,6 +164,7 @@ def read_csv( hex_cols = [] new_dtypes = [] + print("HERE0") if dtype is not None: if isinstance(dtype, abc.Mapping): new_dtypes = dict() @@ -202,48 +203,116 @@ def read_csv( raise ValueError( "dtype should be a scalar/str/list-like/dict-like" ) - + print("HERE1") lineterminator = str(lineterminator) - df = cudf.DataFrame._from_data( - *data_from_pylibcudf_io( - plc.io.csv.read_csv( - plc.io.SourceInfo([datasource]), - lineterminator=lineterminator, - quotechar = quotechar, - quoting = quoting, - doublequote = doublequote, - header = header, - mangle_dupe_cols = mangle_dupe_cols, - usecols = usecols, - delimiter = delimiter, - delim_whitespace = delim_whitespace, - skipinitialspace = skipinitialspace, - col_names = names, - dtypes = 
new_dtypes, - skipfooter = skipfooter, - skiprows = skiprows, - dayfirst = dayfirst, - compression = c_compression, - thousands = thousands, - decimal = decimal, - true_values = true_values, - false_values = false_values, - nrows = nrows if nrows is not None else -1, - byte_range_offset = byte_range[0], - byte_range_size = byte_range[1], - skip_blank_lines = skip_blank_lines, - parse_dates = parse_dates, - parse_hex = hex_cols, - comment = comment, - na_values = na_values, - keep_default_na = keep_default_na, - na_filter = na_filter, - prefix = prefix, - ) - ) + # df = cudf.DataFrame._from_data( + # *data_from_pylibcudf_io( + # plc.io.csv.read_csv( + # plc.io.SourceInfo([datasource]), + # lineterminator=lineterminator, + # quotechar = quotechar, + # quoting = quoting, + # doublequote = doublequote, + # header = header, + # mangle_dupe_cols = mangle_dupe_cols, + # usecols = usecols, + # delimiter = delimiter, + # delim_whitespace = delim_whitespace, + # skipinitialspace = skipinitialspace, + # col_names = names, + # dtypes = new_dtypes, + # skipfooter = skipfooter, + # skiprows = skiprows, + # dayfirst = dayfirst, + # compression = c_compression, + # thousands = thousands, + # decimal = decimal, + # true_values = true_values, + # false_values = false_values, + # nrows = nrows if nrows is not None else -1, + # byte_range_offset = byte_range[0], + # byte_range_size = byte_range[1], + # skip_blank_lines = skip_blank_lines, + # parse_dates = parse_dates, + # parse_hex = hex_cols, + # comment = comment, + # na_values = na_values, + # keep_default_na = keep_default_na, + # na_filter = na_filter, + # prefix = prefix, + # ) + + options = ( + plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([datasource])) + .compression(c_compression) + .mangle_dupe_cols(mangle_dupe_cols) + .byte_range_offset(byte_range[0]) + .byte_range_size(byte_range[1]) + .nrows(nrows if nrows is not None else -1) + .skiprows(skiprows) + .skipfooter(skipfooter) + .quoting(quoting) + 
.lineterminator(lineterminator) + .quotechar(quotechar) + .decimal(decimal) + .delim_whitespace(delim_whitespace) + .skipinitialspace(skipinitialspace) + .skip_blank_lines(skip_blank_lines) + .doublequote(doublequote) + .keep_default_na(keep_default_na) + .na_filter(na_filter) + .dayfirst(dayfirst) + .build() ) + options.set_header(header) + + if names is not None: + options.set_names([str(name) for name in names]) + + if prefix is not None: + options.set_prefix(prefix) + + if usecols is not None: + if all([isinstance(col, int) for col in usecols]): + options.set_use_cols_indexes(list(usecols)) + else: + options.set_use_cols_names([str(name) for name in usecols]) + + if delimiter is not None: + options.set_delimiter(delimiter) + + if thousands is not None: + options.set_thousands(thousands) + + if comment is not None: + options.set_comment(comment) + + if parse_dates is not None: + # Set both since users are allowed to mix column names and indices + options.set_parse_dates(list(filter(lambda x: isinstance(x, str), parse_dates))) + options.set_parse_dates(list(filter(lambda x: isinstance(x, int), parse_dates))) + + if hex_cols is not None: + # Set both since users are allowed to mix column names and indices + options.set_parse_hex(list(filter(lambda x: isinstance(x, str), hex_cols))) + options.set_parse_hex(list(filter(lambda x: isinstance(x, int), hex_cols))) + + options.set_dtypes(new_dtypes) + + if true_values is not None: + options.set_true_values([str(val) for val in true_values]) + + if false_values is not None: + options.set_false_values([str(val) for val in false_values]) + + if na_values is not None: + options.set_na_values([str(val) for val in na_values]) + + df = cudf.DataFrame._from_data(*data_from_pylibcudf_io(plc.io.csv.read_csv(options))) + print("PRE GOT\n", df) + if dtype is not None: if isinstance(dtype, abc.Mapping): for k, v in dtype.items(): diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 
ac772c47e3a..eb33eb5402b 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -240,7 +240,8 @@ def test_csv_reader_numeric_data(dtype, nelem, tmpdir): assert_eq(df, out) -@pytest.mark.parametrize("parse_dates", [["date2"], [0], ["date1", 1, "bad"]]) +# @pytest.mark.parametrize("parse_dates", [["date2"], [0], ["date1", 1, "bad"]]) +@pytest.mark.parametrize("parse_dates", [["date2"]]) def test_csv_reader_datetime(parse_dates): df = make_datetime_dataframe(include_non_standard=True) buffer = df.to_csv(index=False, header=False) @@ -248,7 +249,7 @@ def test_csv_reader_datetime(parse_dates): gdf = read_csv( StringIO(buffer), names=["date1", "date2", "bad"], - parse_dates=parse_dates, + # parse_dates=parse_dates, dayfirst=True, ) # Need to used `date_format='mixed'`, @@ -260,6 +261,8 @@ def test_csv_reader_datetime(parse_dates): dayfirst=True, date_format="mixed", ) + print("GOT\n", gdf) + print("EXPECT\n", pdf) assert_eq(gdf, pdf) diff --git a/python/pylibcudf/pylibcudf/io/csv.pyx b/python/pylibcudf/pylibcudf/io/csv.pyx index 7c060de84ed..1e6ed2b8f17 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pyx +++ b/python/pylibcudf/pylibcudf/io/csv.pyx @@ -57,13 +57,13 @@ cdef class CsvReaderOptions: ) csv_builder.c_obj = csv_reader_options.builder(source.c_obj) csv_builder.source = source + return csv_builder cpdef void set_header(self, size_type header): self.c_obj.set_header(header) cpdef void set_names(self, list col_names): cdef vector[string] vec - vec.reserve(len(col_names)) for name in col_names: vec.push_back(name.encode()) self.c_obj.set_names(vec) @@ -73,14 +73,12 @@ cdef class CsvReaderOptions: cpdef void set_use_cols_indexes(self, list col_indices): cdef vector[int] vec - vec.reserve(len(col_indices)) for i in col_indices: vec.push_back(i) self.c_obj.set_use_cols_indexes(vec) cpdef void set_use_cols_names(self, list col_names): cdef vector[string] vec - vec.reserve(len(col_names)) for name in col_names: 
vec.push_back(name.encode()) self.c_obj.set_use_cols_names(vec) @@ -98,31 +96,27 @@ cdef class CsvReaderOptions: cdef vector[string] vec_str cdef vector[int] vec_int if all([isinstance(date, str) for date in val]): - vec_str.reserve(len(val)) for date in val: vec_str.push_back(date.encode()) self.c_obj.set_parse_dates(vec_str) elif all([isinstance(date, int) for date in val]): - vec_int.reserve(len(val)) for date in val: vec_int.push_back(date) self.c_obj.set_parse_dates(vec_int) else: raise TypeError("Must pass an int or str") - cpdef void set_parse_hex(self, list[IntOrStr] val): + cpdef void set_parse_hex(self, list val): cdef vector[string] vec_str cdef vector[int] vec_int if all([isinstance(hx, str) for hx in val]): - vec_str.reserve(len(val)) for hx in val: vec_str.push_back(hx.encode()) - self.c_obj.set_parse_dates(vec_str) + self.c_obj.set_parse_hex(vec_str) elif all([isinstance(hx, int) for hx in val]): - vec_int.reserve(len(val)) for hx in val: vec_int.push_back(hx) - self.c_obj.set_parse_dates(vec_int) + self.c_obj.set_parse_hex(vec_int) else: raise TypeError("Must pass an int or str") @@ -130,11 +124,12 @@ cdef class CsvReaderOptions: cdef map[string, data_type] dtype_map cdef vector[data_type] dtype_list if isinstance(types, dict): + print("DICT", types) for name, dtype in types.items(): - dtype_map[name.encode()] = (dtype).c_obj + dtype_map[str(name).encode()] = (dtype).c_obj self.c_obj.set_dtypes(dtype_map) elif isinstance(types, list): - dtype_list.reserve(len(types)) + print("LIST", types) for dtype in types: dtype_list.push_back((dtype).c_obj) self.c_obj.set_dtypes(dtype_list) @@ -143,21 +138,18 @@ cdef class CsvReaderOptions: cpdef void set_true_values(self, list true_values): cdef vector[string] vec - vec.reserve(len(true_values)) for val in true_values: vec.push_back(val.encode()) self.c_obj.set_true_values(vec) cpdef void set_false_values(self, list false_values): cdef vector[string] vec - vec.reserve(len(false_values)) for val in false_values: 
vec.push_back(val.encode()) self.c_obj.set_false_values(vec) cpdef void set_na_values(self, list na_values): cdef vector[string] vec - vec.reserve(len(na_values)) for val in na_values: vec.push_back(val.encode()) self.c_obj.set_na_values(vec) @@ -246,46 +238,47 @@ cdef class CsvReaderOptionsBuilder: def read_csv( - SourceInfo source_info, - *, - compression_type compression = compression_type.AUTO, - size_t byte_range_offset = 0, - size_t byte_range_size = 0, - list col_names = None, - str prefix = "", - bool mangle_dupe_cols = True, - list usecols = None, - size_type nrows = -1, - size_type skiprows = 0, - size_type skipfooter = 0, - size_type header = 0, - str lineterminator = "\n", - str delimiter = None, - str thousands = None, - str decimal = ".", - str comment = None, - bool delim_whitespace = False, - bool skipinitialspace = False, - bool skip_blank_lines = True, - quote_style quoting = quote_style.MINIMAL, - str quotechar = '"', - bool doublequote = True, - list parse_dates = None, - list parse_hex = None, - # Technically this should be dict/list - # but using a fused type prevents using None as default - object dtypes = None, - list true_values = None, - list false_values = None, - list na_values = None, - bool keep_default_na = True, - bool na_filter = True, - bool dayfirst = False, - # Note: These options are supported by the libcudf reader - # but are not exposed here since there is no demand for them - # on the Python side yet. 
- # bool detect_whitespace_around_quotes = False, - # DataType timestamp_type = DataType(type_id.EMPTY), + # SourceInfo source_info, + # *, + # compression_type compression = compression_type.AUTO, + # size_t byte_range_offset = 0, + # size_t byte_range_size = 0, + # list col_names = None, + # str prefix = "", + # bool mangle_dupe_cols = True, + # list usecols = None, + # size_type nrows = -1, + # size_type skiprows = 0, + # size_type skipfooter = 0, + # size_type header = 0, + # str lineterminator = "\n", + # str delimiter = None, + # str thousands = None, + # str decimal = ".", + # str comment = None, + # bool delim_whitespace = False, + # bool skipinitialspace = False, + # bool skip_blank_lines = True, + # quote_style quoting = quote_style.MINIMAL, + # str quotechar = '"', + # bool doublequote = True, + # list parse_dates = None, + # list parse_hex = None, + # # Technically this should be dict/list + # # but using a fused type prevents using None as default + # object dtypes = None, + # list true_values = None, + # list false_values = None, + # list na_values = None, + # bool keep_default_na = True, + # bool na_filter = True, + # bool dayfirst = False, + # # Note: These options are supported by the libcudf reader + # # but are not exposed here since there is no demand for them + # # on the Python side yet. + # # bool detect_whitespace_around_quotes = False, + # # DataType timestamp_type = DataType(type_id.EMPTY), + CsvReaderOptions options ): """Reads a CSV file into a :py:class:`~.types.TableWithMetadata`. @@ -372,106 +365,108 @@ def read_csv( TableWithMetadata The Table and its corresponding metadata (column names) that were read in. 
""" - cdef vector[string] c_parse_dates_names - cdef vector[int] c_parse_dates_indexes - cdef vector[int] c_parse_hex_names - cdef vector[int] c_parse_hex_indexes - cdef vector[data_type] c_dtypes_list - cdef map[string, data_type] c_dtypes_map - - cdef csv_reader_options options = ( - csv_reader_options.builder(source_info.c_obj) - .compression(compression) - .mangle_dupe_cols(mangle_dupe_cols) - .byte_range_offset(byte_range_offset) - .byte_range_size(byte_range_size) - .nrows(nrows) - .skiprows(skiprows) - .skipfooter(skipfooter) - .quoting(quoting) - .lineterminator(ord(lineterminator)) - .quotechar(ord(quotechar)) - .decimal(ord(decimal)) - .delim_whitespace(delim_whitespace) - .skipinitialspace(skipinitialspace) - .skip_blank_lines(skip_blank_lines) - .doublequote(doublequote) - .keep_default_na(keep_default_na) - .na_filter(na_filter) - .dayfirst(dayfirst) - .build() - ) - - options.set_header(header) - - if col_names is not None: - options.set_names([str(name).encode() for name in col_names]) - - if prefix is not None: - options.set_prefix(prefix.encode()) - - if usecols is not None: - if all([isinstance(col, int) for col in usecols]): - options.set_use_cols_indexes(list(usecols)) - else: - options.set_use_cols_names([str(name).encode() for name in usecols]) - - if delimiter is not None: - options.set_delimiter(ord(delimiter)) - - if thousands is not None: - options.set_thousands(ord(thousands)) - - if comment is not None: - options.set_comment(ord(comment)) - - if parse_dates is not None: - if not all([isinstance(col, (str, int)) for col in parse_dates]): - raise NotImplementedError( - "`parse_dates`: Must pass a list of column names/indices") - - # Set both since users are allowed to mix column names and indices - c_parse_dates_names, c_parse_dates_indexes = \ - _process_parse_dates_hex(parse_dates) - options.set_parse_dates(c_parse_dates_names) - options.set_parse_dates(c_parse_dates_indexes) - - if parse_hex is not None: - if not all([isinstance(col, 
(str, int)) for col in parse_hex]): - raise NotImplementedError( - "`parse_hex`: Must pass a list of column names/indices") - - # Set both since users are allowed to mix column names and indices - c_parse_hex_names, c_parse_hex_indexes = _process_parse_dates_hex(parse_hex) - options.set_parse_hex(c_parse_hex_names) - options.set_parse_hex(c_parse_hex_indexes) - - if isinstance(dtypes, list): - for dtype in dtypes: - c_dtypes_list.push_back((dtype).c_obj) - options.set_dtypes(c_dtypes_list) - elif isinstance(dtypes, dict): - # dtypes_t is dict - for k, v in dtypes.items(): - c_dtypes_map[str(k).encode()] = (v).c_obj - options.set_dtypes(c_dtypes_map) - elif dtypes is not None: - raise TypeError("dtypes must either by a list/dict") - - if true_values is not None: - options.set_true_values(_make_str_vector(true_values)) - - if false_values is not None: - options.set_false_values(_make_str_vector(false_values)) - - if na_values is not None: - options.set_na_values(_make_str_vector(na_values)) + # cdef vector[string] c_parse_dates_names + # cdef vector[int] c_parse_dates_indexes + # cdef vector[int] c_parse_hex_names + # cdef vector[int] c_parse_hex_indexes + # cdef vector[data_type] c_dtypes_list + # cdef map[string, data_type] c_dtypes_map + + # cdef csv_reader_options options = ( + # csv_reader_options.builder(source_info.c_obj) + # .compression(compression) + # .mangle_dupe_cols(mangle_dupe_cols) + # .byte_range_offset(byte_range_offset) + # .byte_range_size(byte_range_size) + # .nrows(nrows) + # .skiprows(skiprows) + # .skipfooter(skipfooter) + # .quoting(quoting) + # .lineterminator(ord(lineterminator)) + # .quotechar(ord(quotechar)) + # .decimal(ord(decimal)) + # .delim_whitespace(delim_whitespace) + # .skipinitialspace(skipinitialspace) + # .skip_blank_lines(skip_blank_lines) + # .doublequote(doublequote) + # .keep_default_na(keep_default_na) + # .na_filter(na_filter) + # .dayfirst(dayfirst) + # .build() + # ) + + # options.set_header(header) + + # if col_names 
is not None: + # options.set_names([str(name).encode() for name in col_names]) + + # if prefix is not None: + # options.set_prefix(prefix.encode()) + + # if usecols is not None: + # if all([isinstance(col, int) for col in usecols]): + # options.set_use_cols_indexes(list(usecols)) + # else: + # options.set_use_cols_names([str(name).encode() for name in usecols]) + + # if delimiter is not None: + # options.set_delimiter(ord(delimiter)) + + # if thousands is not None: + # options.set_thousands(ord(thousands)) + + # if comment is not None: + # options.set_comment(ord(comment)) + + # if parse_dates is not None: + # if not all([isinstance(col, (str, int)) for col in parse_dates]): + # raise NotImplementedError( + # "`parse_dates`: Must pass a list of column names/indices") + + # # Set both since users are allowed to mix column names and indices + # c_parse_dates_names, c_parse_dates_indexes = \ + # _process_parse_dates_hex(parse_dates) + # options.set_parse_dates(c_parse_dates_names) + # options.set_parse_dates(c_parse_dates_indexes) + + # if parse_hex is not None: + # if not all([isinstance(col, (str, int)) for col in parse_hex]): + # raise NotImplementedError( + # "`parse_hex`: Must pass a list of column names/indices") + + # # Set both since users are allowed to mix column names and indices + # c_parse_hex_names, c_parse_hex_indexes = _process_parse_dates_hex(parse_hex) + # options.set_parse_hex(c_parse_hex_names) + # options.set_parse_hex(c_parse_hex_indexes) + + # if isinstance(dtypes, list): + # for dtype in dtypes: + # c_dtypes_list.push_back((dtype).c_obj) + # options.set_dtypes(c_dtypes_list) + # elif isinstance(dtypes, dict): + # # dtypes_t is dict + # for k, v in dtypes.items(): + # c_dtypes_map[str(k).encode()] = (v).c_obj + # options.set_dtypes(c_dtypes_map) + # elif dtypes is not None: + # raise TypeError("dtypes must either by a list/dict") + + # if true_values is not None: + # options.set_true_values(_make_str_vector(true_values)) + + # if false_values is 
not None: + # options.set_false_values(_make_str_vector(false_values)) + + # if na_values is not None: + # options.set_na_values(_make_str_vector(na_values)) cdef table_with_metadata c_result with nogil: - c_result = move(cpp_read_csv(options)) + c_result = move(cpp_read_csv(options.c_obj)) - return TableWithMetadata.from_libcudf(c_result) + cdef TableWithMetadata tbl_meta = TableWithMetadata.from_libcudf(c_result) + print("PRE PRE GOT\n", tbl_meta) + return tbl_meta # TODO: Implement the remaining methods From 40aba23067a5f944cda884b29a6b387a96ba67bf Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 25 Nov 2024 05:53:38 -0800 Subject: [PATCH 05/11] implement --- python/cudf/cudf/_lib/csv.pyx | 47 +-------- python/cudf/cudf/tests/test_csv.py | 2 - python/pylibcudf/pylibcudf/io/csv.pyx | 137 -------------------------- 3 files changed, 3 insertions(+), 183 deletions(-) diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index aecedcb5081..88c0a583c8d 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -160,11 +160,8 @@ def read_csv( header = -1 elif header == 'infer': header = 0 - hex_cols = [] - new_dtypes = [] - print("HERE0") if dtype is not None: if isinstance(dtype, abc.Mapping): new_dtypes = dict() @@ -203,46 +200,7 @@ def read_csv( raise ValueError( "dtype should be a scalar/str/list-like/dict-like" ) - print("HERE1") lineterminator = str(lineterminator) - - # df = cudf.DataFrame._from_data( - # *data_from_pylibcudf_io( - # plc.io.csv.read_csv( - # plc.io.SourceInfo([datasource]), - # lineterminator=lineterminator, - # quotechar = quotechar, - # quoting = quoting, - # doublequote = doublequote, - # header = header, - # mangle_dupe_cols = mangle_dupe_cols, - # usecols = usecols, - # delimiter = delimiter, - # delim_whitespace = delim_whitespace, - # skipinitialspace = skipinitialspace, - # col_names = names, - # dtypes = new_dtypes, - # skipfooter = skipfooter, - # skiprows = skiprows, - # dayfirst = 
dayfirst, - # compression = c_compression, - # thousands = thousands, - # decimal = decimal, - # true_values = true_values, - # false_values = false_values, - # nrows = nrows if nrows is not None else -1, - # byte_range_offset = byte_range[0], - # byte_range_size = byte_range[1], - # skip_blank_lines = skip_blank_lines, - # parse_dates = parse_dates, - # parse_hex = hex_cols, - # comment = comment, - # na_values = na_values, - # keep_default_na = keep_default_na, - # na_filter = na_filter, - # prefix = prefix, - # ) - options = ( plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([datasource])) .compression(c_compression) @@ -310,8 +268,9 @@ def read_csv( if na_values is not None: options.set_na_values([str(val) for val in na_values]) - df = cudf.DataFrame._from_data(*data_from_pylibcudf_io(plc.io.csv.read_csv(options))) - print("PRE GOT\n", df) + df = cudf.DataFrame._from_data( + *data_from_pylibcudf_io(plc.io.csv.read_csv(options)) + ) if dtype is not None: if isinstance(dtype, abc.Mapping): diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index eb33eb5402b..66e14b7bb24 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -261,8 +261,6 @@ def test_csv_reader_datetime(parse_dates): dayfirst=True, date_format="mixed", ) - print("GOT\n", gdf) - print("EXPECT\n", pdf) assert_eq(gdf, pdf) diff --git a/python/pylibcudf/pylibcudf/io/csv.pyx b/python/pylibcudf/pylibcudf/io/csv.pyx index 1e6ed2b8f17..b1a4af27042 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pyx +++ b/python/pylibcudf/pylibcudf/io/csv.pyx @@ -124,12 +124,10 @@ cdef class CsvReaderOptions: cdef map[string, data_type] dtype_map cdef vector[data_type] dtype_list if isinstance(types, dict): - print("DICT", types) for name, dtype in types.items(): dtype_map[str(name).encode()] = (dtype).c_obj self.c_obj.set_dtypes(dtype_map) elif isinstance(types, list): - print("LIST", types) for dtype in types: dtype_list.push_back((dtype).c_obj) 
self.c_obj.set_dtypes(dtype_list) @@ -238,46 +236,6 @@ cdef class CsvReaderOptionsBuilder: def read_csv( - # SourceInfo source_info, - # *, - # compression_type compression = compression_type.AUTO, - # size_t byte_range_offset = 0, - # size_t byte_range_size = 0, - # list col_names = None, - # str prefix = "", - # bool mangle_dupe_cols = True, - # list usecols = None, - # size_type nrows = -1, - # size_type skiprows = 0, - # size_type skipfooter = 0, - # size_type header = 0, - # str lineterminator = "\n", - # str delimiter = None, - # str thousands = None, - # str decimal = ".", - # str comment = None, - # bool delim_whitespace = False, - # bool skipinitialspace = False, - # bool skip_blank_lines = True, - # quote_style quoting = quote_style.MINIMAL, - # str quotechar = '"', - # bool doublequote = True, - # list parse_dates = None, - # list parse_hex = None, - # # Technically this should be dict/list - # # but using a fused type prevents using None as default - # object dtypes = None, - # list true_values = None, - # list false_values = None, - # list na_values = None, - # bool keep_default_na = True, - # bool na_filter = True, - # bool dayfirst = False, - # # Note: These options are supported by the libcudf reader - # # but are not exposed here since there is no demand for them - # # on the Python side yet. - # # bool detect_whitespace_around_quotes = False, - # # DataType timestamp_type = DataType(type_id.EMPTY), CsvReaderOptions options ): """Reads a CSV file into a :py:class:`~.types.TableWithMetadata`. @@ -365,107 +323,12 @@ def read_csv( TableWithMetadata The Table and its corresponding metadata (column names) that were read in. 
""" - # cdef vector[string] c_parse_dates_names - # cdef vector[int] c_parse_dates_indexes - # cdef vector[int] c_parse_hex_names - # cdef vector[int] c_parse_hex_indexes - # cdef vector[data_type] c_dtypes_list - # cdef map[string, data_type] c_dtypes_map - - # cdef csv_reader_options options = ( - # csv_reader_options.builder(source_info.c_obj) - # .compression(compression) - # .mangle_dupe_cols(mangle_dupe_cols) - # .byte_range_offset(byte_range_offset) - # .byte_range_size(byte_range_size) - # .nrows(nrows) - # .skiprows(skiprows) - # .skipfooter(skipfooter) - # .quoting(quoting) - # .lineterminator(ord(lineterminator)) - # .quotechar(ord(quotechar)) - # .decimal(ord(decimal)) - # .delim_whitespace(delim_whitespace) - # .skipinitialspace(skipinitialspace) - # .skip_blank_lines(skip_blank_lines) - # .doublequote(doublequote) - # .keep_default_na(keep_default_na) - # .na_filter(na_filter) - # .dayfirst(dayfirst) - # .build() - # ) - - # options.set_header(header) - - # if col_names is not None: - # options.set_names([str(name).encode() for name in col_names]) - - # if prefix is not None: - # options.set_prefix(prefix.encode()) - - # if usecols is not None: - # if all([isinstance(col, int) for col in usecols]): - # options.set_use_cols_indexes(list(usecols)) - # else: - # options.set_use_cols_names([str(name).encode() for name in usecols]) - - # if delimiter is not None: - # options.set_delimiter(ord(delimiter)) - - # if thousands is not None: - # options.set_thousands(ord(thousands)) - - # if comment is not None: - # options.set_comment(ord(comment)) - - # if parse_dates is not None: - # if not all([isinstance(col, (str, int)) for col in parse_dates]): - # raise NotImplementedError( - # "`parse_dates`: Must pass a list of column names/indices") - - # # Set both since users are allowed to mix column names and indices - # c_parse_dates_names, c_parse_dates_indexes = \ - # _process_parse_dates_hex(parse_dates) - # options.set_parse_dates(c_parse_dates_names) - # 
options.set_parse_dates(c_parse_dates_indexes) - - # if parse_hex is not None: - # if not all([isinstance(col, (str, int)) for col in parse_hex]): - # raise NotImplementedError( - # "`parse_hex`: Must pass a list of column names/indices") - - # # Set both since users are allowed to mix column names and indices - # c_parse_hex_names, c_parse_hex_indexes = _process_parse_dates_hex(parse_hex) - # options.set_parse_hex(c_parse_hex_names) - # options.set_parse_hex(c_parse_hex_indexes) - - # if isinstance(dtypes, list): - # for dtype in dtypes: - # c_dtypes_list.push_back((dtype).c_obj) - # options.set_dtypes(c_dtypes_list) - # elif isinstance(dtypes, dict): - # # dtypes_t is dict - # for k, v in dtypes.items(): - # c_dtypes_map[str(k).encode()] = (v).c_obj - # options.set_dtypes(c_dtypes_map) - # elif dtypes is not None: - # raise TypeError("dtypes must either by a list/dict") - - # if true_values is not None: - # options.set_true_values(_make_str_vector(true_values)) - - # if false_values is not None: - # options.set_false_values(_make_str_vector(false_values)) - - # if na_values is not None: - # options.set_na_values(_make_str_vector(na_values)) cdef table_with_metadata c_result with nogil: c_result = move(cpp_read_csv(options.c_obj)) cdef TableWithMetadata tbl_meta = TableWithMetadata.from_libcudf(c_result) - print("PRE PRE GOT\n", tbl_meta) return tbl_meta From 79dbafaeb74e852694d2848f65eff4337e6b94d7 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 25 Nov 2024 10:06:56 -0800 Subject: [PATCH 06/11] plumb through tests --- python/cudf/cudf/_lib/csv.pyx | 8 +- python/cudf/cudf/tests/test_csv.py | 5 +- python/pylibcudf/pylibcudf/io/csv.pyx | 115 ++--------- .../pylibcudf/pylibcudf/tests/io/test_csv.py | 183 +++++++++++++++--- 4 files changed, 171 insertions(+), 140 deletions(-) diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 88c0a583c8d..c839b5b38a0 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx 
@@ -248,14 +248,10 @@ def read_csv( options.set_comment(comment) if parse_dates is not None: - # Set both since users are allowed to mix column names and indices - options.set_parse_dates(list(filter(lambda x: isinstance(x, str), parse_dates))) - options.set_parse_dates(list(filter(lambda x: isinstance(x, int), parse_dates))) + options.set_parse_dates(list(parse_dates)) if hex_cols is not None: - # Set both since users are allowed to mix column names and indices - options.set_parse_hex(list(filter(lambda x: isinstance(x, str), hex_cols))) - options.set_parse_hex(list(filter(lambda x: isinstance(x, int), hex_cols))) + options.set_parse_hex(list(hex_cols)) options.set_dtypes(new_dtypes) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 66e14b7bb24..ac772c47e3a 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -240,8 +240,7 @@ def test_csv_reader_numeric_data(dtype, nelem, tmpdir): assert_eq(df, out) -# @pytest.mark.parametrize("parse_dates", [["date2"], [0], ["date1", 1, "bad"]]) -@pytest.mark.parametrize("parse_dates", [["date2"]]) +@pytest.mark.parametrize("parse_dates", [["date2"], [0], ["date1", 1, "bad"]]) def test_csv_reader_datetime(parse_dates): df = make_datetime_dataframe(include_non_standard=True) buffer = df.to_csv(index=False, header=False) @@ -249,7 +248,7 @@ def test_csv_reader_datetime(parse_dates): gdf = read_csv( StringIO(buffer), names=["date1", "date2", "bad"], - # parse_dates=parse_dates, + parse_dates=parse_dates, dayfirst=True, ) # Need to used `date_format='mixed'`, diff --git a/python/pylibcudf/pylibcudf/io/csv.pyx b/python/pylibcudf/pylibcudf/io/csv.pyx index b1a4af27042..507da3b33c2 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pyx +++ b/python/pylibcudf/pylibcudf/io/csv.pyx @@ -95,30 +95,31 @@ cdef class CsvReaderOptions: cpdef void set_parse_dates(self, list val): cdef vector[string] vec_str cdef vector[int] vec_int - if all([isinstance(date, str) for date 
in val]): + if not all([isinstance(col, (str, int)) for col in val]): + raise TypeError("Must be a list of int or str") + else: for date in val: - vec_str.push_back(date.encode()) + if isinstance(date, str): + vec_str.push_back(date.encode()) + else: + vec_int.push_back(date) self.c_obj.set_parse_dates(vec_str) - elif all([isinstance(date, int) for date in val]): - for date in val: - vec_int.push_back(date) self.c_obj.set_parse_dates(vec_int) - else: - raise TypeError("Must pass an int or str") cpdef void set_parse_hex(self, list val): cdef vector[string] vec_str cdef vector[int] vec_int - if all([isinstance(hx, str) for hx in val]): + if not all([isinstance(col, (str, int)) for col in val]): + raise TypeError("Must be a list of int or str") + else: for hx in val: - vec_str.push_back(hx.encode()) + if isinstance(hx, str): + vec_str.push_back(hx.encode()) + else: + vec_int.push_back(hx) + self.c_obj.set_parse_hex(vec_str) - elif all([isinstance(hx, int) for hx in val]): - for hx in val: - vec_int.push_back(hx) self.c_obj.set_parse_hex(vec_int) - else: - raise TypeError("Must pass an int or str") cpdef void set_dtypes(self, object types): cdef map[string, data_type] dtype_map @@ -238,92 +239,6 @@ cdef class CsvReaderOptionsBuilder: def read_csv( CsvReaderOptions options ): - """Reads a CSV file into a :py:class:`~.types.TableWithMetadata`. - - For details, see :cpp:func:`read_csv`. - - Parameters - ---------- - source_info : SourceInfo - The SourceInfo to read the CSV file from. - compression : compression_type, default CompressionType.AUTO - The compression format of the CSV source. - byte_range_offset : size_type, default 0 - Number of bytes to skip from source start. - byte_range_size : size_type, default 0 - Number of bytes to read. By default, will read all bytes. - col_names : list, default None - The column names to use. - prefix : string, default '' - The prefix to apply to the column names. 
- mangle_dupe_cols : bool, default True - If True, rename duplicate column names. - usecols : list, default None - Specify the string column names/integer column indices of columns to be read. - nrows : size_type, default -1 - The number of rows to read. - skiprows : size_type, default 0 - The number of rows to skip from the start before reading - skipfooter : size_type, default 0 - The number of rows to skip from the end - header : size_type, default 0 - The index of the row that will be used for header names. - Pass -1 to use default column names. - lineterminator : str, default '\\n' - The character used to determine the end of a line. - delimiter : str, default "," - The character used to separate fields in a row. - thousands : str, default None - The character used as the thousands separator. - Cannot match delimiter. - decimal : str, default '.' - The character used as the decimal separator. - Cannot match delimiter. - comment : str, default None - The character used to identify the start of a comment line. - (which will be skipped by the reader) - delim_whitespace : bool, default False - If True, treat whitespace as the field delimiter. - skipinitialspace : bool, default False - If True, skip whitespace after the delimiter. - skip_blank_lines : bool, default True - If True, ignore empty lines (otherwise line values are parsed as null). - quoting : QuoteStyle, default QuoteStyle.MINIMAL - The quoting style used in the input CSV data. One of - { QuoteStyle.MINIMAL, QuoteStyle.ALL, QuoteStyle.NONNUMERIC, QuoteStyle.NONE } - quotechar : str, default '"' - The character used to indicate quoting. - doublequote : bool, default True - If True, a quote inside a value is double-quoted. - parse_dates : list, default None - A list of integer column indices/string column names - of columns to read as datetime. - parse_hex : list, default None - A list of integer column indices/string column names - of columns to read as hexadecimal. 
- dtypes : Union[Dict[str, DataType], List[DataType]], default None - A list of data types or a dictionary mapping column names - to a DataType. - true_values : List[str], default None - A list of additional values to recognize as True. - false_values : List[str], default None - A list of additional values to recognize as False. - na_values : List[str], default None - A list of additional values to recognize as null. - keep_default_na : bool, default True - Whether to keep the built-in default N/A values. - na_filter : bool, default True - Whether to detect missing values. If False, can - improve performance. - dayfirst : bool, default False - If True, interpret dates as being in the DD/MM format. - - Returns - ------- - TableWithMetadata - The Table and its corresponding metadata (column names) that were read in. - """ - cdef table_with_metadata c_result with nogil: c_result = move(cpp_read_csv(options.c_obj)) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_csv.py b/python/pylibcudf/pylibcudf/tests/io/test_csv.py index 90d2d0896a5..952847c16b2 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_csv.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_csv.py @@ -77,14 +77,31 @@ def test_read_csv_basic( offset=skiprows, length=nrows if nrows != -1 else None ) - res = plc.io.csv.read_csv( - plc.io.SourceInfo([source]), - delimiter=delimiter, - compression=compression_type, - col_names=column_names, - nrows=nrows, - skiprows=skiprows, + options = ( + plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([source])) + .compression(compression_type) + .mangle_dupe_cols(True) + .byte_range_offset(0) + .byte_range_size(0) + .nrows(nrows) + .skiprows(skiprows) + .skipfooter(0) + .quoting(0) + .lineterminator("\n") + .quotechar('"') + .decimal(".") + .delim_whitespace(False) + .skipinitialspace(False) + .skip_blank_lines(True) + .doublequote(True) + .keep_default_na(True) + .na_filter(True) + .dayfirst(False) + .build() ) + options.set_delimiter(delimiter) + 
options.set_names([str(name) for name in column_names]) + res = plc.io.csv.read_csv(options) assert_table_and_meta_eq( pa_table, @@ -110,15 +127,31 @@ def test_read_csv_byte_range(table_data, chunk_size, tmp_path): file_size = os.stat(source).st_size tbls_w_meta = [] for segment in range((file_size + chunk_size - 1) // chunk_size): - tbls_w_meta.append( - plc.io.csv.read_csv( - plc.io.SourceInfo([source]), - byte_range_offset=segment * chunk_size, - byte_range_size=chunk_size, - header=-1, - col_names=pa_table.column_names, - ) + options = ( + plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([source])) + .compression(plc.io.types.CompressionType.AUTO) + .mangle_dupe_cols(True) + .byte_range_offset(segment * chunk_size) + .byte_range_size(chunk_size) + .nrows(-1) + .skiprows(0) + .skipfooter(0) + .quoting(0) + .lineterminator("\n") + .quotechar('"') + .decimal(".") + .delim_whitespace(False) + .skipinitialspace(False) + .skip_blank_lines(True) + .doublequote(True) + .keep_default_na(True) + .na_filter(True) + .dayfirst(False) + .build() ) + options.set_header(-1) + options.set_names([str(name) for name in pa_table.column_names]) + tbls_w_meta.append(plc.io.csv.read_csv(options)) if isinstance(source, io.IOBase): source.seek(0) exp = pd.read_csv(source, names=pa_table.column_names, header=None) @@ -161,9 +194,35 @@ def test_read_csv_dtypes(csv_table_data, source_or_sink, usecols): new_schema = pa.schema(new_fields) - res = plc.io.csv.read_csv( - plc.io.SourceInfo([source]), dtypes=dtypes, usecols=usecols + options = ( + plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([source])) + .compression(plc.io.types.CompressionType.AUTO) + .mangle_dupe_cols(True) + .byte_range_offset(0) + .byte_range_size(0) + .nrows(-1) + .skiprows(0) + .skipfooter(0) + .quoting(0) + .lineterminator("\n") + .quotechar('"') + .decimal(".") + .delim_whitespace(False) + .skipinitialspace(False) + .skip_blank_lines(True) + .doublequote(True) + .keep_default_na(True) + .na_filter(True) + 
.dayfirst(False) + .build() ) + options.set_dtypes(dtypes) + if usecols is not None: + if all([isinstance(col, int) for col in usecols]): + options.set_use_cols_indexes(list(usecols)) + else: + options.set_use_cols_names([str(name) for name in usecols]) + res = plc.io.csv.read_csv(options) new_table = pa_table.cast(new_schema) assert_table_and_meta_eq(new_table, res) @@ -171,7 +230,7 @@ def test_read_csv_dtypes(csv_table_data, source_or_sink, usecols): @pytest.mark.parametrize("skip_blanks", [True, False]) @pytest.mark.parametrize("decimal, quotechar", [(".", "'"), ("_", '"')]) -@pytest.mark.parametrize("lineterminator", ["\n", "\r\n"]) +@pytest.mark.parametrize("lineterminator", ["\n", "\t"]) def test_read_csv_parse_options( source_or_sink, decimal, quotechar, skip_blanks, lineterminator ): @@ -188,19 +247,39 @@ def test_read_csv_parse_options( write_source_str(source_or_sink, buffer) - plc_table_w_meta = plc.io.csv.read_csv( - plc.io.SourceInfo([source_or_sink]), - comment="#", - decimal=decimal, - skip_blank_lines=skip_blanks, - quotechar=quotechar, + options = ( + plc.io.csv.CsvReaderOptions.builder( + plc.io.SourceInfo([source_or_sink]) + ) + .compression(plc.io.types.CompressionType.AUTO) + .mangle_dupe_cols(True) + .byte_range_offset(0) + .byte_range_size(0) + .nrows(-1) + .skiprows(0) + .skipfooter(0) + .quoting(0) + .lineterminator(lineterminator) + .quotechar(quotechar) + .decimal(decimal) + .delim_whitespace(False) + .skipinitialspace(False) + .skip_blank_lines(skip_blanks) + .doublequote(True) + .keep_default_na(True) + .na_filter(True) + .dayfirst(False) + .build() ) + options.set_comment("#") + plc_table_w_meta = plc.io.csv.read_csv(options) df = pd.read_csv( StringIO(buffer), comment="#", decimal=decimal, skip_blank_lines=skip_blanks, quotechar=quotechar, + lineterminator=lineterminator, ) assert_table_and_meta_eq(pa.Table.from_pandas(df), plc_table_w_meta) @@ -216,12 +295,33 @@ def test_read_csv_na_values( write_source_str(source_or_sink, buffer) - 
plc_table_w_meta = plc.io.csv.read_csv( - plc.io.SourceInfo([source_or_sink]), - na_filter=na_filter, - na_values=na_values if na_filter else None, - keep_default_na=keep_default_na, + options = ( + plc.io.csv.CsvReaderOptions.builder( + plc.io.SourceInfo([source_or_sink]) + ) + .compression(plc.io.types.CompressionType.AUTO) + .mangle_dupe_cols(True) + .byte_range_offset(0) + .byte_range_size(0) + .nrows(-1) + .skiprows(0) + .skipfooter(0) + .quoting(0) + .lineterminator("\n") + .quotechar('"') + .decimal(".") + .delim_whitespace(False) + .skipinitialspace(False) + .skip_blank_lines(True) + .doublequote(True) + .keep_default_na(keep_default_na) + .na_filter(na_filter) + .dayfirst(False) + .build() ) + if na_filter and na_values is not None: + options.set_na_values(na_values) + plc_table_w_meta = plc.io.csv.read_csv(options) df = pd.read_csv( StringIO(buffer), na_filter=na_filter, @@ -241,9 +341,30 @@ def test_read_csv_header(csv_table_data, source_or_sink, header): **_COMMON_CSV_SOURCE_KWARGS, ) - plc_table_w_meta = plc.io.csv.read_csv( - plc.io.SourceInfo([source]), header=header + options = ( + plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([source])) + .compression(plc.io.types.CompressionType.AUTO) + .mangle_dupe_cols(True) + .byte_range_offset(0) + .byte_range_size(0) + .nrows(-1) + .skiprows(0) + .skipfooter(0) + .quoting(0) + .lineterminator("\n") + .quotechar('"') + .decimal(".") + .delim_whitespace(False) + .skipinitialspace(False) + .skip_blank_lines(True) + .doublequote(True) + .keep_default_na(True) + .na_filter(True) + .dayfirst(False) + .build() ) + options.set_header(header) + plc_table_w_meta = plc.io.csv.read_csv(options) if header > 0: if header < len(pa_table): names_row = pa_table.take([header - 1]).to_pylist()[0].values() From 14a36e6fcc188201f645f0a6067ce369fc57a504 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 25 Nov 2024 10:24:48 -0800 Subject: [PATCH 07/11] minimize options structs in tests --- 
python/cudf/cudf/_lib/csv.pyx | 5 +- .../pylibcudf/pylibcudf/tests/io/test_csv.py | 111 +----------------- 2 files changed, 9 insertions(+), 107 deletions(-) diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index c839b5b38a0..d42aa8540ae 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -160,7 +160,9 @@ def read_csv( header = -1 elif header == 'infer': header = 0 + hex_cols = [] + new_dtypes = [] if dtype is not None: if isinstance(dtype, abc.Mapping): @@ -200,7 +202,6 @@ def read_csv( raise ValueError( "dtype should be a scalar/str/list-like/dict-like" ) - lineterminator = str(lineterminator) options = ( plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([datasource])) .compression(c_compression) @@ -211,7 +212,7 @@ def read_csv( .skiprows(skiprows) .skipfooter(skipfooter) .quoting(quoting) - .lineterminator(lineterminator) + .lineterminator(str(lineterminator)) .quotechar(quotechar) .decimal(decimal) .delim_whitespace(delim_whitespace) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_csv.py b/python/pylibcudf/pylibcudf/tests/io/test_csv.py index 952847c16b2..2d02f5a4404 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_csv.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_csv.py @@ -80,23 +80,8 @@ def test_read_csv_basic( options = ( plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([source])) .compression(compression_type) - .mangle_dupe_cols(True) - .byte_range_offset(0) - .byte_range_size(0) .nrows(nrows) .skiprows(skiprows) - .skipfooter(0) - .quoting(0) - .lineterminator("\n") - .quotechar('"') - .decimal(".") - .delim_whitespace(False) - .skipinitialspace(False) - .skip_blank_lines(True) - .doublequote(True) - .keep_default_na(True) - .na_filter(True) - .dayfirst(False) .build() ) options.set_delimiter(delimiter) @@ -129,24 +114,8 @@ def test_read_csv_byte_range(table_data, chunk_size, tmp_path): for segment in range((file_size + chunk_size - 1) // chunk_size): options = ( 
plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([source])) - .compression(plc.io.types.CompressionType.AUTO) - .mangle_dupe_cols(True) .byte_range_offset(segment * chunk_size) .byte_range_size(chunk_size) - .nrows(-1) - .skiprows(0) - .skipfooter(0) - .quoting(0) - .lineterminator("\n") - .quotechar('"') - .decimal(".") - .delim_whitespace(False) - .skipinitialspace(False) - .skip_blank_lines(True) - .doublequote(True) - .keep_default_na(True) - .na_filter(True) - .dayfirst(False) .build() ) options.set_header(-1) @@ -194,28 +163,9 @@ def test_read_csv_dtypes(csv_table_data, source_or_sink, usecols): new_schema = pa.schema(new_fields) - options = ( - plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([source])) - .compression(plc.io.types.CompressionType.AUTO) - .mangle_dupe_cols(True) - .byte_range_offset(0) - .byte_range_size(0) - .nrows(-1) - .skiprows(0) - .skipfooter(0) - .quoting(0) - .lineterminator("\n") - .quotechar('"') - .decimal(".") - .delim_whitespace(False) - .skipinitialspace(False) - .skip_blank_lines(True) - .doublequote(True) - .keep_default_na(True) - .na_filter(True) - .dayfirst(False) - .build() - ) + options = plc.io.csv.CsvReaderOptions.builder( + plc.io.SourceInfo([source]) + ).build() options.set_dtypes(dtypes) if usecols is not None: if all([isinstance(col, int) for col in usecols]): @@ -251,24 +201,10 @@ def test_read_csv_parse_options( plc.io.csv.CsvReaderOptions.builder( plc.io.SourceInfo([source_or_sink]) ) - .compression(plc.io.types.CompressionType.AUTO) - .mangle_dupe_cols(True) - .byte_range_offset(0) - .byte_range_size(0) - .nrows(-1) - .skiprows(0) - .skipfooter(0) - .quoting(0) .lineterminator(lineterminator) .quotechar(quotechar) .decimal(decimal) - .delim_whitespace(False) - .skipinitialspace(False) .skip_blank_lines(skip_blanks) - .doublequote(True) - .keep_default_na(True) - .na_filter(True) - .dayfirst(False) .build() ) options.set_comment("#") @@ -299,24 +235,8 @@ def test_read_csv_na_values( 
plc.io.csv.CsvReaderOptions.builder( plc.io.SourceInfo([source_or_sink]) ) - .compression(plc.io.types.CompressionType.AUTO) - .mangle_dupe_cols(True) - .byte_range_offset(0) - .byte_range_size(0) - .nrows(-1) - .skiprows(0) - .skipfooter(0) - .quoting(0) - .lineterminator("\n") - .quotechar('"') - .decimal(".") - .delim_whitespace(False) - .skipinitialspace(False) - .skip_blank_lines(True) - .doublequote(True) .keep_default_na(keep_default_na) .na_filter(na_filter) - .dayfirst(False) .build() ) if na_filter and na_values is not None: @@ -341,28 +261,9 @@ def test_read_csv_header(csv_table_data, source_or_sink, header): **_COMMON_CSV_SOURCE_KWARGS, ) - options = ( - plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([source])) - .compression(plc.io.types.CompressionType.AUTO) - .mangle_dupe_cols(True) - .byte_range_offset(0) - .byte_range_size(0) - .nrows(-1) - .skiprows(0) - .skipfooter(0) - .quoting(0) - .lineterminator("\n") - .quotechar('"') - .decimal(".") - .delim_whitespace(False) - .skipinitialspace(False) - .skip_blank_lines(True) - .doublequote(True) - .keep_default_na(True) - .na_filter(True) - .dayfirst(False) - .build() - ) + options = plc.io.csv.CsvReaderOptions.builder( + plc.io.SourceInfo([source]) + ).build() options.set_header(header) plc_table_w_meta = plc.io.csv.read_csv(options) if header > 0: From 739e61fb5904700c023549e3eea0149179967592 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 25 Nov 2024 12:25:38 -0800 Subject: [PATCH 08/11] add docstrings, plumb through cudf polars, clean up --- python/cudf_polars/cudf_polars/dsl/ir.py | 40 ++- python/pylibcudf/pylibcudf/io/csv.pyx | 439 ++++++++++++++++++++++- 2 files changed, 446 insertions(+), 33 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 62a2da9dcea..1b6328af3ec 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -476,23 +476,31 @@ def do_evaluate( with 
path.open() as f: while f.readline() == "\n": skiprows += 1 - tbl_w_meta = plc.io.csv.read_csv( - plc.io.SourceInfo([path]), - delimiter=sep, - quotechar=quote, - lineterminator=eol, - col_names=column_names, - header=header, - usecols=usecols, - na_filter=True, - na_values=null_values, - keep_default_na=False, - skiprows=skiprows, - comment=comment, - decimal=decimal, - dtypes=schema, - nrows=n_rows, + options = ( + plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([path])) + .nrows(n_rows) + .skiprows(skiprows) + .lineterminator(str(eol)) + .quotechar(str(quote)) + .decimal(decimal) + .keep_default_na(keep_default_na=False) + .na_filter(na_filter=True) + .build() ) + options.set_delimiter(str(sep)) + if column_names is not None: + options.set_names([str(name) for name in column_names]) + options.set_header(header) + options.set_dtypes(schema) + if usecols is not None: + if all(isinstance(col, int) for col in usecols): + options.set_use_cols_indexes(list(usecols)) + else: + options.set_use_cols_names([str(name) for name in usecols]) + options.set_na_values(null_values) + if comment is not None: + options.set_comment(comment) + tbl_w_meta = plc.io.csv.read_csv(options) pieces.append(tbl_w_meta) if read_partial: n_rows -= tbl_w_meta.tbl.num_rows() diff --git a/python/pylibcudf/pylibcudf/io/csv.pyx b/python/pylibcudf/pylibcudf/io/csv.pyx index 507da3b33c2..0cafcfe8d1d 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pyx +++ b/python/pylibcudf/pylibcudf/io/csv.pyx @@ -32,26 +32,27 @@ __all__ = [ "CsvReaderOptionsBuilder", ] -cdef tuple _process_parse_dates_hex(list cols): - cdef vector[string] str_cols - cdef vector[int] int_cols - for col in cols: - if isinstance(col, str): - str_cols.push_back(col.encode()) - else: - int_cols.push_back(col) - return str_cols, int_cols - -cdef vector[string] _make_str_vector(list vals): - cdef vector[string] res - for val in vals: - res.push_back((val).encode()) - return res - - cdef class CsvReaderOptions: + """The settings to use 
for ``read_csv`` + For details, see :cpp:class:`cudf::io::csv_reader_options` + """ @staticmethod def builder(SourceInfo source): + """ + Create a CsvReaderOptionsBuilder object + + For details, see :cpp:func:`cudf::io::csv_reader_options::builder` + + Parameters + ---------- + source : SourceInfo + The source to read the CSV file from. + + Returns + ------- + CsvReaderOptionsBuilder + Builder to build CsvReaderOptions + """ cdef CsvReaderOptionsBuilder csv_builder = CsvReaderOptionsBuilder.__new__( CsvReaderOptionsBuilder ) @@ -60,39 +61,147 @@ cdef class CsvReaderOptions: return csv_builder cpdef void set_header(self, size_type header): + """ + Sets header row index. + + Parameters + ---------- + header : size_type + Index where header row is located + + Returns + ------- + None + """ self.c_obj.set_header(header) cpdef void set_names(self, list col_names): + """ + Sets names of the columns. + + Parameters + ---------- + col_names : list[str] + List of column names + + Returns + ------- + None + """ cdef vector[string] vec for name in col_names: vec.push_back(name.encode()) self.c_obj.set_names(vec) cpdef void set_prefix(self, str prefix): + """ + Sets prefix to be used for column ID. + + Parameters + ---------- + prefix : str + String used as prefix for each column name + + Returns + ------- + None + """ self.c_obj.set_prefix(prefix.encode()) cpdef void set_use_cols_indexes(self, list col_indices): + """ + Sets indexes of columns to read. + + Parameters + ---------- + col_indices : list[int] + List of column indices that are needed + + Returns + ------- + None + """ cdef vector[int] vec for i in col_indices: vec.push_back(i) self.c_obj.set_use_cols_indexes(vec) cpdef void set_use_cols_names(self, list col_names): + """ + Sets names of the columns to be read.
+ + Parameters + ---------- + col_names : list[str] + List of column names that are needed + + Returns + ------- + None + """ cdef vector[string] vec for name in col_names: vec.push_back(name.encode()) self.c_obj.set_use_cols_names(vec) cpdef void set_delimiter(self, str delimiter): + """ + Sets field delimiter. + + Parameters + ---------- + delimiter : str + A character to indicate delimiter + + Returns + ------- + None + """ self.c_obj.set_delimiter(ord(delimiter)) cpdef void set_thousands(self, str thousands): + """ + Sets numeric data thousands separator. + + Parameters + ---------- + thousands : str + A character that separates thousands + + Returns + ------- + None + """ self.c_obj.set_thousands(ord(thousands)) cpdef void set_comment(self, str comment): + """ + Sets comment line start character. + + Parameters + ---------- + comment : str + A character that indicates comment + + Returns + ------- + None + """ self.c_obj.set_comment(ord(comment)) cpdef void set_parse_dates(self, list val): + """ + Sets indexes or names of columns to read as datetime. + + Parameters + ---------- + val : list[int | str] + List of column indices or names to infer as datetime. + + Returns + ------- + None + """ cdef vector[string] vec_str cdef vector[int] vec_int if not all([isinstance(col, (str, int)) for col in val]): @@ -107,6 +216,18 @@ cdef class CsvReaderOptions: self.c_obj.set_parse_dates(vec_int) cpdef void set_parse_hex(self, list val): + """ + Sets indexes or names of columns to parse as hexadecimal. + + Parameters + ---------- + val : list[int | str] + List of column indices or names to parse as hexadecimal + + Returns + ------- + None + """ cdef vector[string] vec_str cdef vector[int] vec_int if not all([isinstance(col, (str, int)) for col in val]): @@ -122,6 +243,19 @@ cdef class CsvReaderOptions: self.c_obj.set_parse_hex(vec_int) cpdef void set_dtypes(self, object types): + """ + Sets per-column types.
+ + Parameters + ---------- + types : dict[str, data_type] | list[data_type] + Column name to data type map specifying the columns' target data types. + Or a list specifying the columns' target data types. + + Returns + ------- + None + """ cdef map[string, data_type] dtype_map cdef vector[data_type] dtype_list if isinstance(types, dict): @@ -136,18 +270,54 @@ cdef class CsvReaderOptions: raise TypeError("Must pass an dict or list") cpdef void set_true_values(self, list true_values): + """ + Sets additional values to recognize as boolean true values. + + Parameters + ---------- + true_values : list[str] + List of values to be considered to be true + + Returns + ------- + None + """ cdef vector[string] vec for val in true_values: vec.push_back(val.encode()) self.c_obj.set_true_values(vec) cpdef void set_false_values(self, list false_values): + """ + Sets additional values to recognize as boolean false values. + + Parameters + ---------- + false_values : list[str] + List of values to be considered to be false + + Returns + ------- + None + """ cdef vector[string] vec for val in false_values: vec.push_back(val.encode()) self.c_obj.set_false_values(vec) cpdef void set_na_values(self, list na_values): + """ + Sets additional values to recognize as null values. + + Parameters + ---------- + na_values : list[str] + List of values to be considered to be null + + Returns + ------- + None + """ cdef vector[string] vec for val in na_values: vec.push_back(val.encode()) @@ -155,79 +325,301 @@ cdef class CsvReaderOptions: cdef class CsvReaderOptionsBuilder: + """ + Builder to build options for ``read_csv`` + + For details, see :cpp:class:`cudf::io::csv_reader_options_builder` + """ cpdef CsvReaderOptionsBuilder compression(self, compression_type compression): + """ + Sets compression format of the source. 
+ + Parameters + ---------- + compression : compression_type + Compression type + + Returns + ------- + CsvReaderOptionsBuilder + """ self.c_obj.compression(compression) return self cpdef CsvReaderOptionsBuilder mangle_dupe_cols(self, bool mangle_dupe_cols): + """ + Sets whether to rename duplicate column names. + + Parameters + ---------- + mangle_dupe_cols : bool + Boolean value to enable/disable + + Returns + ------- + CsvReaderOptionsBuilder + """ self.c_obj.mangle_dupe_cols(mangle_dupe_cols) return self cpdef CsvReaderOptionsBuilder byte_range_offset(self, size_t byte_range_offset): + """ + Sets number of bytes to skip from source start. + + Parameters + ---------- + byte_range_offset : size_t + Number of bytes of offset + + Returns + ------- + CsvReaderOptionsBuilder + """ self.c_obj.byte_range_offset(byte_range_offset) return self cpdef CsvReaderOptionsBuilder byte_range_size(self, size_t byte_range_size): + """ + Sets number of bytes to read. + + Parameters + ---------- + byte_range_size : size_t + Number of bytes to read + + Returns + ------- + CsvReaderOptionsBuilder + """ self.c_obj.byte_range_size(byte_range_size) return self cpdef CsvReaderOptionsBuilder nrows(self, size_type nrows): + """ + Sets number of rows to read. + + Parameters + ---------- + nrows : size_type + Number of rows to read + + Returns + ------- + CsvReaderOptionsBuilder + """ self.c_obj.nrows(nrows) return self cpdef CsvReaderOptionsBuilder skiprows(self, size_type skiprows): + """ + Sets number of rows to skip from start. + + Parameters + ---------- + skiprows : size_type + Number of rows to skip + + Returns + ------- + CsvReaderOptionsBuilder + """ self.c_obj.skiprows(skiprows) return self cpdef CsvReaderOptionsBuilder skipfooter(self, size_type skipfooter): + """ + Sets number of rows to skip from end.
+ + Parameters + ---------- + skipfooter : size_type + Number of rows to skip + + Returns + ------- + CsvReaderOptionsBuilder + """ self.c_obj.skipfooter(skipfooter) return self cpdef CsvReaderOptionsBuilder quoting(self, quote_style quoting): + """ + Sets quoting style. + + Parameters + ---------- + quoting : quote_style + Quoting style used + + Returns + ------- + CsvReaderOptionsBuilder + """ self.c_obj.quoting(quoting) return self cpdef CsvReaderOptionsBuilder lineterminator(self, str lineterminator): + """ + Sets line terminator. + + Parameters + ---------- + lineterminator : str + A character to indicate line termination + + Returns + ------- + CsvReaderOptionsBuilder + """ self.c_obj.lineterminator(ord(lineterminator)) return self cpdef CsvReaderOptionsBuilder quotechar(self, str quotechar): + """ + Sets quoting character. + + Parameters + ---------- + quotechar : str + A character to indicate quoting + + Returns + ------- + CsvReaderOptionsBuilder + """ self.c_obj.quotechar(ord(quotechar)) return self cpdef CsvReaderOptionsBuilder decimal(self, str decimal): + """ + Sets decimal point character. + + Parameters + ---------- + decimal : str + A character that indicates decimal values + + Returns + ------- + CsvReaderOptionsBuilder + """ self.c_obj.decimal(ord(decimal)) return self cpdef CsvReaderOptionsBuilder delim_whitespace(self, bool delim_whitespace): + """ + Sets whether to treat whitespace as field delimiter. + + Parameters + ---------- + delim_whitespace : bool + Boolean value to enable/disable + + Returns + ------- + CsvReaderOptionsBuilder + """ self.c_obj.delim_whitespace(delim_whitespace) return self cpdef CsvReaderOptionsBuilder skipinitialspace(self, bool skipinitialspace): + """ + Sets whether to skip whitespace after the delimiter.
+ + Parameters + ---------- + skipinitialspace : bool + Boolean value to enable/disable + + Returns + ------- + CsvReaderOptionsBuilder + """ self.c_obj.skipinitialspace(skipinitialspace) return self cpdef CsvReaderOptionsBuilder skip_blank_lines(self, bool skip_blank_lines): + """ + Sets whether to ignore empty lines or parse line values as invalid. + + Parameters + ---------- + skip_blank_lines : bool + Boolean value to enable/disable + + Returns + ------- + CsvReaderOptionsBuilder + """ self.c_obj.skip_blank_lines(skip_blank_lines) return self cpdef CsvReaderOptionsBuilder doublequote(self, bool doublequote): + """ + Sets whether a quote inside a value is double-quoted. + + Parameters + ---------- + doublequote : bool + Boolean value to enable/disable + + Returns + ------- + CsvReaderOptionsBuilder + """ self.c_obj.doublequote(doublequote) return self cpdef CsvReaderOptionsBuilder keep_default_na(self, bool keep_default_na): + """ + Sets whether to keep the built-in default NA values. + + Parameters + ---------- + keep_default_na : bool + Boolean value to enable/disable + + Returns + ------- + CsvReaderOptionsBuilder + """ self.c_obj.keep_default_na(keep_default_na) return self cpdef CsvReaderOptionsBuilder na_filter(self, bool na_filter): + """ + Sets whether to disable null filter. + + Parameters + ---------- + na_filter : bool + Boolean value to enable/disable + + Returns + ------- + CsvReaderOptionsBuilder + """ self.c_obj.na_filter(na_filter) return self cpdef CsvReaderOptionsBuilder dayfirst(self, bool dayfirst): + """ + Sets whether to parse dates as DD/MM versus MM/DD.
+ + Parameters + ---------- + dayfirst : bool + Boolean value to enable/disable + + Returns + ------- + CsvReaderOptionsBuilder + """ self.c_obj.dayfirst(dayfirst) return self cpdef CsvReaderOptions build(self): + """Create a CsvReaderOptions object""" cdef CsvReaderOptions csv_options = CsvReaderOptions.__new__( CsvReaderOptions ) @@ -239,6 +631,19 @@ cdef class CsvReaderOptionsBuilder: def read_csv( CsvReaderOptions options ): + """ + Read from CSV format. + + The source to read from and options are encapsulated + by the `options` object. + + For details, see :cpp:func:`read_csv`. + + Parameters + ---------- + options: CsvReaderOptions + Settings for controlling reading behavior + """ cdef table_with_metadata c_result with nogil: c_result = move(cpp_read_csv(options.c_obj)) From 187a43f9307811e1f5768a88d912f7f0d5c2f8e8 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 25 Nov 2024 12:31:54 -0800 Subject: [PATCH 09/11] cpdef read_csv --- python/pylibcudf/pylibcudf/io/csv.pxd | 4 +++- python/pylibcudf/pylibcudf/io/csv.pyx | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/python/pylibcudf/pylibcudf/io/csv.pxd b/python/pylibcudf/pylibcudf/io/csv.pxd index 8515aa23b53..95f3ff4fe45 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pxd +++ b/python/pylibcudf/pylibcudf/io/csv.pxd @@ -9,7 +9,7 @@ from pylibcudf.libcudf.io.csv cimport ( csv_reader_options, csv_reader_options_builder, ) -from pylibcudf.io.types cimport SinkInfo, SourceInfo +from pylibcudf.io.types cimport SinkInfo, SourceInfo, TableWithMetadata from pylibcudf.table cimport Table from pylibcudf.libcudf.io.types cimport ( @@ -61,6 +61,8 @@ cdef class CsvReaderOptionsBuilder: cpdef CsvReaderOptionsBuilder dayfirst(self, bool dayfirst) cpdef CsvReaderOptions build(self) +cpdef TableWithMetadata read_csv(CsvReaderOptions options) + cdef class CsvWriterOptions: cdef csv_writer_options c_obj cdef Table table diff --git a/python/pylibcudf/pylibcudf/io/csv.pyx 
b/python/pylibcudf/pylibcudf/io/csv.pyx index 0cafcfe8d1d..efc9bb813a1 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pyx +++ b/python/pylibcudf/pylibcudf/io/csv.pyx @@ -628,7 +628,7 @@ cdef class CsvReaderOptionsBuilder: return csv_options -def read_csv( +cpdef TableWithMetadata read_csv( CsvReaderOptions options ): """ From 31d121211718dd68020dcaf5ff7d9f251991012c Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 25 Nov 2024 17:48:01 -0800 Subject: [PATCH 10/11] usecols is a list of strings --- python/cudf_polars/cudf_polars/dsl/ir.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 1b6328af3ec..a5441e9d59f 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -493,10 +493,7 @@ def do_evaluate( options.set_header(header) options.set_dtypes(schema) if usecols is not None: - if all(isinstance(col, int) for col in usecols): - options.set_use_cols_indexes(list(usecols)) - else: - options.set_use_cols_names([str(name) for name in usecols]) + options.set_use_cols_names([str(name) for name in usecols]) options.set_na_values(null_values) if comment is not None: options.set_comment(comment) From db196206b0a3432bade8c34f14e02b96c470c383 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 26 Nov 2024 12:11:46 -0800 Subject: [PATCH 11/11] address review --- python/cudf/cudf/_lib/csv.pyx | 2 +- python/pylibcudf/pylibcudf/io/csv.pyi | 3 ++- python/pylibcudf/pylibcudf/tests/io/test_csv.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index d42aa8540ae..641fc18c203 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -234,7 +234,7 @@ def read_csv( options.set_prefix(prefix) if usecols is not None: - if all([isinstance(col, int) for col in usecols]): + if all(isinstance(col, int) for col in 
usecols): options.set_use_cols_indexes(list(usecols)) else: options.set_use_cols_names([str(name) for name in usecols]) diff --git a/python/pylibcudf/pylibcudf/io/csv.pyi b/python/pylibcudf/pylibcudf/io/csv.pyi index adfd29eb530..540cbc778ea 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pyi +++ b/python/pylibcudf/pylibcudf/io/csv.pyi @@ -1,7 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from collections.abc import Mapping -from typing import Self + +from typing_extensions import Self from pylibcudf.io.types import ( CompressionType, diff --git a/python/pylibcudf/pylibcudf/tests/io/test_csv.py b/python/pylibcudf/pylibcudf/tests/io/test_csv.py index 2d02f5a4404..1cbaac57315 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_csv.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_csv.py @@ -168,7 +168,7 @@ def test_read_csv_dtypes(csv_table_data, source_or_sink, usecols): ).build() options.set_dtypes(dtypes) if usecols is not None: - if all([isinstance(col, int) for col in usecols]): + if all(isinstance(col, int) for col in usecols): options.set_use_cols_indexes(list(usecols)) else: options.set_use_cols_names([str(name) for name in usecols])