diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index a689cfbcb1418..76252b8d86c5b 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -39,6 +39,7 @@ ) from pandas._libs.lib import is_string_array from pandas._libs.tslibs import timezones +from pandas.compat import HAS_PYARROW from pandas.compat._optional import import_optional_dependency from pandas.compat.pickle_compat import patch_pickle from pandas.errors import ( @@ -376,6 +377,13 @@ def read_hdf( object The selected object. Return type depends on the object stored. + Notes + ----- + When ``errors="surrogatepass"``, ``pd.options.future.infer_string`` is true, + and PyArrow is installed, if a UTF-16 surrogate is encountered when decoding + to UTF-8, the resulting dtype will be + ``pd.StringDtype(storage="python", na_value=np.nan)``. + See Also -------- DataFrame.to_hdf : Write a HDF file from a DataFrame. @@ -2257,6 +2265,20 @@ def convert( # making an Index instance could throw a number of different errors try: new_pd_index = factory(values, **kwargs) + except UnicodeEncodeError as err: + if ( + errors == "surrogatepass" + and get_option("future.infer_string") + and str(err).endswith("surrogates not allowed") + and HAS_PYARROW + ): + new_pd_index = factory( + values, + dtype=StringDtype(storage="python", na_value=np.nan), + **kwargs, + ) + else: + raise except ValueError: # if the output freq is different that what we recorded, # it should be None (see also 'doc example part 2') @@ -3170,12 +3192,29 @@ def read_index_node( **kwargs, ) else: - index = factory( - _unconvert_index( - data, kind, encoding=self.encoding, errors=self.errors - ), - **kwargs, - ) + try: + index = factory( + _unconvert_index( + data, kind, encoding=self.encoding, errors=self.errors + ), + **kwargs, + ) + except UnicodeEncodeError as err: + if ( + self.errors == "surrogatepass" + and get_option("future.infer_string") + and str(err).endswith("surrogates not allowed") + and HAS_PYARROW + ): + index = factory( + 
_unconvert_index( + data, kind, encoding=self.encoding, errors=self.errors + ), + dtype=StringDtype(storage="python", na_value=np.nan), + **kwargs, + ) + else: + raise index.name = name @@ -3311,13 +3350,24 @@ def read( self.validate_read(columns, where) index = self.read_index("index", start=start, stop=stop) values = self.read_array("values", start=start, stop=stop) - result = Series(values, index=index, name=self.name, copy=False) - if ( - using_string_dtype() - and isinstance(values, np.ndarray) - and is_string_array(values, skipna=True) - ): - result = result.astype(StringDtype(na_value=np.nan)) + try: + result = Series(values, index=index, name=self.name, copy=False) + except UnicodeEncodeError as err: + if ( + self.errors == "surrogatepass" + and get_option("future.infer_string") + and str(err).endswith("surrogates not allowed") + and HAS_PYARROW + ): + result = Series( + values, + index=index, + name=self.name, + copy=False, + dtype=StringDtype(storage="python", na_value=np.nan), + ) + else: + raise return result def write(self, obj, **kwargs) -> None: @@ -4764,7 +4814,24 @@ def read( values = values.reshape((1, values.shape[0])) if isinstance(values, (np.ndarray, DatetimeArray)): - df = DataFrame(values.T, columns=cols_, index=index_, copy=False) + try: + df = DataFrame(values.T, columns=cols_, index=index_, copy=False) + except UnicodeEncodeError as err: + if ( + self.errors == "surrogatepass" + and get_option("future.infer_string") + and str(err).endswith("surrogates not allowed") + and HAS_PYARROW + ): + df = DataFrame( + values.T, + columns=cols_, + index=index_, + copy=False, + dtype=StringDtype(storage="python", na_value=np.nan), + ) + else: + raise elif isinstance(values, Index): df = DataFrame(values, columns=cols_, index=index_) else: @@ -4774,23 +4841,10 @@ def read( assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) # If str / string dtype is stored in meta, use that. 
- converted = False for column in cols_: dtype = getattr(self.table.attrs, f"{column}_meta", None) if dtype in ["str", "string"]: df[column] = df[column].astype(dtype) - converted = True - # Otherwise try inference. - if ( - not converted - and using_string_dtype() - and isinstance(values, np.ndarray) - and is_string_array( - values, - skipna=True, - ) - ): - df = df.astype(StringDtype(na_value=np.nan)) frames.append(df) if len(frames) == 1: @@ -5224,7 +5278,7 @@ def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.nd # encode if needed if len(data): data = ( - Series(data.ravel(), copy=False) + Series(data.ravel(), copy=False, dtype="object") .str.encode(encoding, errors) ._values.reshape(data.shape) ) @@ -5264,7 +5318,9 @@ def _unconvert_string_array( dtype = f"U{itemsize}" if isinstance(data[0], bytes): - ser = Series(data, copy=False).str.decode(encoding, errors=errors) + ser = Series(data, copy=False).str.decode( + encoding, errors=errors, dtype="object" + ) data = ser.to_numpy() data.flags.writeable = True else: diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index bb2058c050f2a..b3ab6b48508e1 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import PY312 import pandas as pd @@ -25,7 +23,6 @@ timedelta_range, ) import pandas._testing as tm -from pandas.conftest import has_pyarrow from pandas.tests.io.pytables.common import ( _maybe_remove, ensure_clean_store, @@ -385,20 +382,24 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path): tm.assert_series_equal(read_hdf(path, "ss4"), concat([df["B"], df2["B"]])) -@pytest.mark.xfail( - using_string_dtype() and has_pyarrow, - reason="TODO(infer_string): can't encode '\ud800': surrogates not allowed", -) @pytest.mark.parametrize("format", ["fixed", "table"]) -def 
test_to_hdf_errors(tmp_path, format, setup_path): +def test_to_hdf_errors(tmp_path, format, setup_path, using_infer_string): data = ["\ud800foo"] - ser = Series(data, index=Index(data)) + ser = Series(data, index=Index(data, dtype="object"), dtype="object") path = tmp_path / setup_path # GH 20835 ser.to_hdf(path, key="table", format=format, errors="surrogatepass") result = read_hdf(path, "table", errors="surrogatepass") - tm.assert_series_equal(result, ser) + + if using_infer_string: + # https://github.com/pandas-dev/pandas/pull/60993 + # Surrogates fall back to python storage. + dtype = pd.StringDtype(storage="python", na_value=np.nan) + else: + dtype = "object" + expected = Series(data, index=Index(data, dtype=dtype), dtype=dtype) + tm.assert_series_equal(result, expected) def test_create_table_index(setup_path):