Skip to content

Commit

Permalink
GH-33592: [C++] support casting nullable fields to non-nullable if th…
Browse files Browse the repository at this point in the history
…ere are no null values (#43782)

* GitHub Issue: #33592

Notes for myself/fixer:
- [tests that need to get updated](https://github.com/search?q=repo%3Aapache%2Farrow%20%22cannot%20cast%20nullable%20field%22&type=code) (almost definitely not a complete list)
- [update: actually we should handle the go implementation in the go repository.] hmm, looks like [go wrapper does its own nullability checks](https://github.com/apache/arrow/blob/f078942ce2df68de8f48c3b4233132133601ca53/go/arrow/compute/cast.go#L283-L286). I assume this is just an optimization to not have to go into the C++ and hit the error down there. So if we just delete this check then I think the C++ logic will handle all of it???
- [how the plain column handles this cast](https://github.com/apache/arrow/blob/f078942ce2df68de8f48c3b4233132133601ca53/python/pyarrow/table.pxi#L3238-L3243), some logic like this probably needs to get ported over to the struct implementation
- (running from /cpp/build) `cmake .. --preset ninja-debug-basic`, then `cmake --build . && PYTHON=python ctest -R 'arrow-compute-scalar-cast-test' --output-on-failure` to run the specific test
-  to format: `uvx pre-commit run --all-files clang-format`

Lead-authored-by: Nick Crews <nicholas.b.crews@gmail.com>
Co-authored-by: Antoine Pitrou <antoine@python.org>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
NickCrews and pitrou authored Feb 13, 2025
1 parent 251758d commit 6a47e4d
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 81 deletions.
30 changes: 18 additions & 12 deletions cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
Original file line number Diff line number Diff line change
Expand Up @@ -363,10 +363,6 @@ struct CastStruct {
const auto& in_field = in_type.field(in_field_index);
// If there are more in_fields check if they match the out_field.
if (in_field->name() == out_field->name()) {
if (in_field->nullable() && !out_field->nullable()) {
return Status::TypeError("cannot cast nullable field to non-nullable field: ",
in_type.ToString(), " ", out_type.ToString());
}
// Found matching in_field and out_field.
fields_to_select[out_field_index++] = in_field_index;
// Using the same in_field for multiple out_fields is not allowed.
Expand Down Expand Up @@ -403,17 +399,27 @@ struct CastStruct {
}

int out_field_index = 0;
for (int field_index : fields_to_select) {
const auto& target_type = out->type()->field(out_field_index++)->type();
if (field_index == kFillNullSentinel) {
ARROW_ASSIGN_OR_RAISE(auto nulls,
MakeArrayOfNull(target_type->GetSharedPtr(), batch.length));
for (int in_field_index : fields_to_select) {
const auto& out_field = out_type.field(out_field_index++);
const auto& out_field_type = out_field->type();
if (in_field_index == kFillNullSentinel) {
ARROW_ASSIGN_OR_RAISE(
auto nulls, MakeArrayOfNull(out_field_type->GetSharedPtr(), batch.length,
ctx->memory_pool()));
out_array->child_data.push_back(nulls->data());
} else {
const auto& values = (in_array.child_data[field_index].ToArrayData()->Slice(
const auto& in_field = in_type.field(in_field_index);
const auto& in_values = (in_array.child_data[in_field_index].ToArrayData()->Slice(
in_array.offset, in_array.length));
ARROW_ASSIGN_OR_RAISE(Datum cast_values,
Cast(values, target_type, options, ctx->exec_context()));
if (in_field->nullable() && !out_field->nullable() &&
in_values->GetNullCount() > 0) {
return Status::Invalid(
"field '", in_field->name(), "' of type ", in_field->type()->ToString(),
" has nulls. Can't cast to non-nullable field '", out_field->name(),
"' of type ", out_field_type->ToString());
}
ARROW_ASSIGN_OR_RAISE(Datum cast_values, Cast(in_values, out_field_type, options,
ctx->exec_context()));
DCHECK(cast_values.is_array());
out_array->child_data.push_back(cast_values.array());
}
Expand Down
141 changes: 72 additions & 69 deletions cpp/src/arrow/compute/kernels/scalar_cast_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4076,92 +4076,95 @@ TEST(Cast, StructToBiggerNullableStruct) {
TEST(Cast, StructToDifferentNullabilityStruct) {
{
// OK to go from non-nullable to nullable...
std::vector<std::shared_ptr<Field>> fields_src_non_nullable = {
std::vector<std::shared_ptr<Field>> fields_src = {
std::make_shared<Field>("a", int8(), false),
std::make_shared<Field>("b", int8(), false),
std::make_shared<Field>("c", int8(), false)};
std::shared_ptr<Array> a_src_non_nullable, b_src_non_nullable, c_src_non_nullable;
a_src_non_nullable = ArrayFromJSON(int8(), "[11, 23, 56]");
b_src_non_nullable = ArrayFromJSON(int8(), "[32, 46, 37]");
c_src_non_nullable = ArrayFromJSON(int8(), "[95, 11, 44]");
ASSERT_OK_AND_ASSIGN(
auto src_non_nullable,
StructArray::Make({a_src_non_nullable, b_src_non_nullable, c_src_non_nullable},
fields_src_non_nullable));

std::shared_ptr<Array> a_dest_nullable, b_dest_nullable, c_dest_nullable;
a_dest_nullable = ArrayFromJSON(int64(), "[11, 23, 56]");
b_dest_nullable = ArrayFromJSON(int64(), "[32, 46, 37]");
c_dest_nullable = ArrayFromJSON(int64(), "[95, 11, 44]");

std::vector<std::shared_ptr<Field>> fields_dest1_nullable = {
std::vector<std::shared_ptr<Array>> arrays_src = {
ArrayFromJSON(int8(), "[11, 23, 56]"),
ArrayFromJSON(int8(), "[32, 46, 37]"),
ArrayFromJSON(int8(), "[95, 11, 44]"),
};
ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make(arrays_src, fields_src));

std::vector<std::shared_ptr<Field>> fields_dest = {
std::make_shared<Field>("a", int64(), true),
std::make_shared<Field>("b", int64(), true),
std::make_shared<Field>("c", int64(), true)};
ASSERT_OK_AND_ASSIGN(
auto dest1_nullable,
StructArray::Make({a_dest_nullable, b_dest_nullable, c_dest_nullable},
fields_dest1_nullable));
CheckCast(src_non_nullable, dest1_nullable);

std::vector<std::shared_ptr<Field>> fields_dest2_nullable = {
std::make_shared<Field>("a", int64(), true),
std::make_shared<Field>("c", int64(), true)};
ASSERT_OK_AND_ASSIGN(
auto dest2_nullable,
StructArray::Make({a_dest_nullable, c_dest_nullable}, fields_dest2_nullable));
CheckCast(src_non_nullable, dest2_nullable);

std::vector<std::shared_ptr<Field>> fields_dest3_nullable = {
std::make_shared<Field>("b", int64(), true)};
ASSERT_OK_AND_ASSIGN(auto dest3_nullable,
StructArray::Make({b_dest_nullable}, fields_dest3_nullable));
CheckCast(src_non_nullable, dest3_nullable);
std::make_shared<Field>("c", int64(), true),
};
std::vector<std::shared_ptr<Array>> arrays_dest = {
ArrayFromJSON(int64(), "[11, 23, 56]"),
ArrayFromJSON(int64(), "[32, 46, 37]"),
ArrayFromJSON(int64(), "[95, 11, 44]"),
};
ASSERT_OK_AND_ASSIGN(auto dest, StructArray::Make(arrays_dest, fields_dest));
CheckCast(src, dest);

std::vector<std::shared_ptr<Field>> fields_dest_ac = {fields_dest[0], fields_dest[2]};
std::vector<std::shared_ptr<Array>> arrays_dest_ac = {arrays_dest[0], arrays_dest[2]};
ASSERT_OK_AND_ASSIGN(auto dest_ac, StructArray::Make(arrays_dest_ac, fields_dest_ac));
CheckCast(src, dest_ac);

std::vector<std::shared_ptr<Field>> fields_dest_b = {fields_dest[1]};
std::vector<std::shared_ptr<Array>> arrays_dest_b = {arrays_dest[1]};
ASSERT_OK_AND_ASSIGN(auto dest_b, StructArray::Make(arrays_dest_b, fields_dest_b));
CheckCast(src, dest_b);
}
{
// But NOT OK to go from nullable to non-nullable...
std::vector<std::shared_ptr<Field>> fields_src_nullable = {
// But when going from nullable to non-nullable, all data must be non-null...
std::vector<std::shared_ptr<Field>> fields_src = {
std::make_shared<Field>("a", int8(), true),
std::make_shared<Field>("b", int8(), true),
std::make_shared<Field>("c", int8(), true)};
std::shared_ptr<Array> a_src_nullable, b_src_nullable, c_src_nullable;
a_src_nullable = ArrayFromJSON(int8(), "[1, null, 5]");
b_src_nullable = ArrayFromJSON(int8(), "[3, 4, null]");
c_src_nullable = ArrayFromJSON(int8(), "[9, 11, 44]");
ASSERT_OK_AND_ASSIGN(
auto src_nullable,
StructArray::Make({a_src_nullable, b_src_nullable, c_src_nullable},
fields_src_nullable));

std::vector<std::shared_ptr<Field>> fields_dest1_non_nullable = {
std::vector<std::shared_ptr<Array>> arrays_src = {
ArrayFromJSON(int8(), "[1, null, 5]"),
ArrayFromJSON(int8(), "[3, 4, null]"),
ArrayFromJSON(int8(), "[9, 11, 44]"),
};
ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make(arrays_src, fields_src));

std::vector<std::shared_ptr<Field>> fields_dest = {
std::make_shared<Field>("a", int64(), false),
std::make_shared<Field>("b", int64(), false),
std::make_shared<Field>("c", int64(), false)};
const auto dest1_non_nullable = arrow::struct_(fields_dest1_non_nullable);
const auto options1_non_nullable = CastOptions::Safe(dest1_non_nullable);
EXPECT_RAISES_WITH_MESSAGE_THAT(
TypeError,
::testing::HasSubstr("cannot cast nullable field to non-nullable field"),
Cast(src_nullable, options1_non_nullable));
Invalid,
::testing::HasSubstr(
"field 'a' of type int8 has nulls. Can't cast to non-nullable field 'a' "
"of type int64"),
Cast(src, CastOptions::Safe(arrow::struct_(fields_dest))));

std::vector<std::shared_ptr<Field>> fields_dest2_non_nullable = {
std::make_shared<Field>("a", int64(), false),
std::make_shared<Field>("c", int64(), false)};
const auto dest2_non_nullable = arrow::struct_(fields_dest2_non_nullable);
const auto options2_non_nullable = CastOptions::Safe(dest2_non_nullable);
std::vector<std::shared_ptr<Field>> fields_dest_ac = {fields_dest[0], fields_dest[2]};
EXPECT_RAISES_WITH_MESSAGE_THAT(
TypeError,
::testing::HasSubstr("cannot cast nullable field to non-nullable field"),
Cast(src_nullable, options2_non_nullable));

std::vector<std::shared_ptr<Field>> fields_dest3_non_nullable = {
std::make_shared<Field>("c", int64(), false)};
const auto dest3_non_nullable = arrow::struct_(fields_dest3_non_nullable);
const auto options3_non_nullable = CastOptions::Safe(dest3_non_nullable);
Invalid,
::testing::HasSubstr(
"field 'a' of type int8 has nulls. Can't cast to non-nullable field 'a' "
"of type int64"),
Cast(src, CastOptions::Safe(arrow::struct_(fields_dest_ac))));

// if we only select a field with no nulls, it should be fine:
std::vector<std::shared_ptr<Field>> fields_dest_c = {fields_dest[2]};
std::vector<std::shared_ptr<Array>> arrays_dest_c = {
ArrayFromJSON(int64(), "[9, 11, 44]")};
ASSERT_OK_AND_ASSIGN(auto dest_c, StructArray::Make(arrays_dest_c, fields_dest_c));
CheckCast(src, dest_c);

// A slice that doesn't contain nulls is castable...
std::vector<std::shared_ptr<Array>> arrays_dest_0 = {
ArrayFromJSON(int64(), "[1]"),
ArrayFromJSON(int64(), "[3]"),
ArrayFromJSON(int64(), "[9]"),
};
ASSERT_OK_AND_ASSIGN(auto dest_0, StructArray::Make(arrays_dest_0, fields_dest));
CheckCast(src->Slice(0, 1), dest_0);

// ...but a slice that contains nulls will error.
EXPECT_RAISES_WITH_MESSAGE_THAT(
TypeError,
::testing::HasSubstr("cannot cast nullable field to non-nullable field"),
Cast(src_nullable, options3_non_nullable));
Invalid,
::testing::HasSubstr(
"field 'a' of type int8 has nulls. Can't cast to non-nullable field 'a' "
"of type int64"),
Cast(src->Slice(1, 3), CastOptions::Safe(arrow::struct_(fields_dest))));
}
}

Expand Down

0 comments on commit 6a47e4d

Please sign in to comment.