Skip to content

Commit

Permalink
ENH: Improved error message and raise new error for small-string NaN …
Browse files Browse the repository at this point in the history
…edge case in HDFStore.append (#60829)

* Add clearer error messages for datatype mismatches in HDFStore.append. Raise a ValueError when nan_rep is too large for the existing PyTables column. Add and modify the applicable tests.

* Fix missed tests and correct mistake in error message.

* Remove excess comments. Revert the error type change to avoid API changes. Move the nan_rep tests into a separate function.
  • Loading branch information
JakeTT404 authored Feb 5, 2025
1 parent 51b12e8 commit 57340ec
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 12 deletions.
9 changes: 9 additions & 0 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -3524,6 +3524,12 @@ def validate(self, other) -> None:
# Value of type "Optional[Any]" is not indexable [index]
oax = ov[i] # type: ignore[index]
if sax != oax:
if c == "values_axes" and sax.kind != oax.kind:
raise ValueError(
f"Cannot serialize the column [{oax.values[0]}] "
f"because its data contents are not [{sax.kind}] "
f"but [{oax.kind}] object dtype"
)
raise ValueError(
f"invalid combination of [{c}] on appending data "
f"[{sax}] vs current table [{oax}]"
Expand Down Expand Up @@ -5136,6 +5142,9 @@ def _maybe_convert_for_string_atom(
data = bvalues.copy()
data[mask] = nan_rep

if existing_col and mask.any() and len(nan_rep) > existing_col.itemsize:
raise ValueError("NaN representation is too large for existing column size")

# see if we have a valid string type
inferred_type = lib.infer_dtype(data, skipna=False)
if inferred_type != "string":
Expand Down
35 changes: 29 additions & 6 deletions pandas/tests/io/pytables/test_append.py
Original file line number Diff line number Diff line change
Expand Up @@ -823,12 +823,9 @@ def test_append_raise(setup_path):
store.append("df", df)
df["foo"] = "bar"
msg = re.escape(
"invalid combination of [values_axes] on appending data "
"[name->values_block_1,cname->values_block_1,"
"dtype->bytes24,kind->string,shape->(1, 30)] "
"vs current table "
"[name->values_block_1,cname->values_block_1,"
"dtype->datetime64[s],kind->datetime64[s],shape->None]"
"Cannot serialize the column [foo] "
"because its data contents are not [string] "
"but [datetime64[s]] object dtype"
)
with pytest.raises(ValueError, match=msg):
store.append("df", df)
Expand Down Expand Up @@ -997,3 +994,29 @@ def test_append_to_multiple_min_itemsize(setup_path):
)
result = store.select_as_multiple(["index", "nums", "strs"])
tm.assert_frame_equal(result, expected, check_index_type=True)


def test_append_string_nan_rep(setup_path):
    """Check how HDFStore.append treats NaNs in string columns (GH 16300).

    Three scenarios are exercised against freshly created tables:

    * appending NaN-containing data to column "A" must raise;
    * appending NaN-containing data to column "B", first written with
      ``nan_rep="bars"``, must raise;
    * appending NaN-containing data to column "A", first written with
      ``nan_rep="n"``, must succeed and round-trip.
    """
    filled = DataFrame({"A": "a", "B": "foo"}, index=np.arange(10))
    # Same frame, but with every column set to NaN for index labels 0 to 4.
    holed = filled.copy()
    holed.loc[0:4, :] = np.nan
    too_large = "NaN representation is too large for existing column size"

    with ensure_clean_store(setup_path) as store:
        # Case 1: the existing string column is too small.
        store.append("sa", filled["A"])
        with pytest.raises(ValueError, match=too_large):
            store.append("sa", holed["A"])

        # Case 2: the nan_rep chosen on the first append is too big.
        store.append("sb", filled["B"], nan_rep="bars")
        with pytest.raises(ValueError, match=too_large):
            store.append("sb", holed["B"])

        # Case 3: a shorter custom nan_rep lets the NaN rows be appended.
        store.append("sc", filled["A"], nan_rep="n")
        store.append("sc", holed["A"])
        roundtripped = store["sc"]
        expected = concat([filled["A"], holed["A"]])
        tm.assert_series_equal(roundtripped, expected)
9 changes: 3 additions & 6 deletions pandas/tests/io/pytables/test_round_trip.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,12 +213,9 @@ def test_table_values_dtypes_roundtrip(setup_path):

# incompatible dtype
msg = re.escape(
"invalid combination of [values_axes] on appending data "
"[name->values_block_0,cname->values_block_0,"
"dtype->float64,kind->float,shape->(1, 3)] vs "
"current table [name->values_block_0,"
"cname->values_block_0,dtype->int64,kind->integer,"
"shape->None]"
"Cannot serialize the column [a] "
"because its data contents are not [float] "
"but [integer] object dtype"
)
with pytest.raises(ValueError, match=msg):
store.append("df_i8", df1)
Expand Down

0 comments on commit 57340ec

Please sign in to comment.