Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions cpp/src/arrow/array/array_binary_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,14 @@ TEST(StringViewArray, Validate) {
}),
Ok());

// Variadic buffer slots, when present, must contain real buffers.
EXPECT_THAT(MakeBinaryViewArray({nullptr},
{
util::ToInlineBinaryView("hello"),
util::ToInlineBinaryView("world"),
}),
Raises(StatusCode::Invalid));

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we test the error message somehow?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, I'll update the tests later.


// non-inline views are expected to reference only buffers managed by the array
EXPECT_THAT(
MakeBinaryViewArray(
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/arrow/array/validate.cc
Original file line number Diff line number Diff line change
Expand Up @@ -504,6 +504,12 @@ struct ValidateArrayImpl {
: *layout.variadic_spec;

if (buffer == nullptr) {
if (layout.variadic_spec && i >= static_cast<int>(layout.buffers.size())) {
return Status::Invalid("Array of type ", type.ToString(),
" has a null variadic buffer at buffer index #", i,
" (variadic buffer index #", i - layout.buffers.size(),
")");
}
continue;
}
int64_t min_buffer_size = 0;
Expand Down
5 changes: 5 additions & 0 deletions cpp/src/arrow/c/bridge.cc
Original file line number Diff line number Diff line change
Expand Up @@ -608,6 +608,11 @@ struct ArrayExporter {
export_.variadic_buffer_sizes_.resize(variadic_buffers.size());
size_t i = 0;
for (const auto& buf : variadic_buffers) {
if (buf == nullptr) {
return Status::Invalid("Cannot export array of type ", data->type->ToString(),
": null variadic buffer at buffer index #", i + 2,
" (variadic buffer index #", i, ")");
}
export_.variadic_buffer_sizes_[i++] = buf->size();
}
export_.buffers_.back() = export_.variadic_buffer_sizes_.data();
Expand Down
16 changes: 16 additions & 0 deletions cpp/src/arrow/c/bridge_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -930,6 +930,22 @@ TEST_F(TestArrayExport, PrimitiveSliced) {
TestPrimitive(factory);
}

TEST_F(TestArrayExport, RejectNullVariadicBuffers) {
for (const auto& type : {binary_view(), utf8_view()}) {
auto arr =
MakeArray(ArrayData::Make(type, /*length=*/2,
{nullptr,
Buffer::FromVector(std::vector<BinaryViewType::c_type>{
util::ToInlineBinaryView("hello"),
util::ToInlineBinaryView("world"),
}),
nullptr}));

struct ArrowArray c_export;
ASSERT_RAISES(Invalid, ExportArray(*arr, &c_export));

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we test the error message too?

}
}

constexpr std::string_view binary_view_buffer_content0 = "12345foo bar baz quux",
binary_view_buffer_content1 = "BinaryViewMultipleBuffers";

Expand Down
14 changes: 9 additions & 5 deletions cpp/src/arrow/compute/kernels/scalar_cast_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -438,8 +438,7 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou

auto* out_views = output->GetMutableValues<BinaryViewType::c_type>(1);

// If all entries are inline, we can drop the extra data buffer for
// large strings in output->buffers[2].
// If all entries are inline, there are no variadic data buffers to expose.
bool all_entries_are_inline = true;
VisitSetBitRunsVoid(
validity, output->offset, output->length,
Expand All @@ -463,8 +462,9 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou
}
});
if (all_entries_are_inline) {
output->buffers[2] = nullptr;
output->buffers.resize(2);
}

return Status::OK();
}

Expand Down Expand Up @@ -508,11 +508,15 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou

const int32_t fixed_size_width = input.type->byte_width();
const int64_t total_length = input.offset + input.length;
// Values of width <= BinaryViewType::kInlineSize are stored inline in the
// views, so the output only needs a variadic data buffer for larger widths.
const bool values_are_inline = fixed_size_width <= BinaryViewType::kInlineSize;
const size_t num_buffers = values_are_inline ? 2 : 3;

ArrayData* output = out->array_data().get();
DCHECK_EQ(output->length, input.length);
output->offset = input.offset;
output->buffers.resize(3);
output->buffers.resize(num_buffers);
output->SetNullCount(input.null_count);
// Share the validity bitmap buffer
output->buffers[0] = input.GetBuffer(0);
Expand All @@ -539,7 +543,7 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou
}

// Inline string and non-inline string loops
if (fixed_size_width <= BinaryViewType::kInlineSize) {
if (values_are_inline) {
int32_t data_offset = static_cast<int32_t>(input.offset) * fixed_size_width;
for (int64_t i = 0; i < input.length; i++) {
auto& out_view = out_views[i];
Expand Down
87 changes: 81 additions & 6 deletions cpp/src/arrow/compute/kernels/scalar_cast_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3307,9 +3307,12 @@ TEST(Cast, BinaryToString) {
// N.B. null buffer is not always the same if input sliced
AssertBufferSame(*invalid_utf8, *strings, 0);

// ARROW-16757: we no longer zero copy, but the contents are equal
ASSERT_NE(invalid_utf8->data()->buffers[1].get(), strings->data()->buffers[2].get());
if (!is_binary_view_like(*string_type)) {
if (is_binary_view_like(*string_type)) {
ASSERT_EQ(strings->data()->buffers.size(), 2);
} else {
// ARROW-16757: we no longer zero copy, but the contents are equal
ASSERT_NE(invalid_utf8->data()->buffers[1].get(),
strings->data()->buffers[2].get());
ASSERT_TRUE(invalid_utf8->data()->buffers[1]->Equals(*strings->data()->buffers[2]));
}
}
Expand Down Expand Up @@ -3349,9 +3352,12 @@ TEST(Cast, BinaryOrStringToBinary) {
// N.B. null buffer is not always the same if input sliced
AssertBufferSame(*invalid_utf8, *strings, 0);

// ARROW-16757: we no longer zero copy, but the contents are equal
ASSERT_NE(invalid_utf8->data()->buffers[1].get(), strings->data()->buffers[2].get());
if (!is_binary_view_like(*to_type)) {
if (is_binary_view_like(*to_type)) {
ASSERT_EQ(strings->data()->buffers.size(), 2);
} else {
// ARROW-16757: we no longer zero copy, but the contents are equal
ASSERT_NE(invalid_utf8->data()->buffers[1].get(),
strings->data()->buffers[2].get());
ASSERT_TRUE(invalid_utf8->data()->buffers[1]->Equals(*strings->data()->buffers[2]));
}

Expand All @@ -3361,6 +3367,75 @@ TEST(Cast, BinaryOrStringToBinary) {
}
}

TEST(Cast, BinaryOrStringToView) {
// GH-49740: when all values were inline, the cast left a null variadic
// buffer slot behind and exporting to the C data interface crashed.
const std::vector<std::pair<std::string, int64_t>> cases = {
// Empty inputs are handled before the cast kernel runs.
{"[]", 2},
{R"(["a", null, "e"])", 2},
{"[null, null]", 2},
{R"(["aaaaaaaaaaaaa", null, "eeeeeeeeeeeee"])", 3},
{R"(["a", null, "aaaaaaaaaaaaa"])", 3},
};

for (auto from_type : {binary(), large_binary(), utf8(), large_utf8()}) {
for (auto to_type : {binary_view(), utf8_view()}) {
for (const auto& [json, expected_num_buffers] : cases) {
auto input = ArrayFromJSON(from_type, json);
auto expected = ArrayFromJSON(to_type, json);

ASSERT_OK_AND_ASSIGN(auto casted, Cast(*input, to_type));
ValidateOutput(*casted);
AssertArraysEqual(*expected, *casted);

ASSERT_EQ(casted->data()->buffers.size(), expected_num_buffers)
<< "from: " << from_type->ToString() << ", to: " << to_type->ToString()
<< ", values: " << json;
if (expected_num_buffers == 3) {
ASSERT_NE(casted->data()->buffers[2], nullptr)
<< "from: " << from_type->ToString() << ", to: " << to_type->ToString()
<< ", values: " << json;
}
}
}
}
}

TEST(Cast, FixedSizeBinaryToView) {
// Fixed-size binary uses a separate cast kernel with the same GH-49740
// issue. Cover non-empty widths on both sides of the BinaryView inline limit.
const std::vector<std::tuple<std::shared_ptr<DataType>, std::string, int64_t>> cases = {
// Empty inputs are handled before the cast kernel runs.
{fixed_size_binary(1), "[]", 2},
{fixed_size_binary(13), "[]", 2},
{fixed_size_binary(1), R"(["a", null, "e"])", 2},
{fixed_size_binary(1), "[null, null]", 2},
{fixed_size_binary(12), R"(["aaaaaaaaaaaa", null])", 2},
{fixed_size_binary(13), R"(["aaaaaaaaaaaaa", null, "eeeeeeeeeeeee"])", 3},
};

for (const auto& [from_type, json, expected_num_buffers] : cases) {
for (auto to_type : {binary_view(), utf8_view()}) {
auto input = ArrayFromJSON(from_type, json);
auto expected = ArrayFromJSON(to_type, json);

ASSERT_OK_AND_ASSIGN(auto casted, Cast(*input, to_type));
ValidateOutput(*casted);
AssertArraysEqual(*expected, *casted);

ASSERT_EQ(casted->data()->buffers.size(), expected_num_buffers)
<< "from: " << from_type->ToString() << ", to: " << to_type->ToString()
<< ", values: " << json;
if (expected_num_buffers == 3) {
ASSERT_NE(casted->data()->buffers[2], nullptr)
<< "from: " << from_type->ToString() << ", to: " << to_type->ToString()
<< ", values: " << json;
}
}
}
}

TEST(Cast, StringToString) {
for (auto from_type : {utf8(), utf8_view(), large_utf8()}) {
for (auto to_type : {utf8(), utf8_view(), large_utf8()}) {
Expand Down
33 changes: 33 additions & 0 deletions python/pyarrow/tests/test_cffi.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,39 @@ def test_export_import_array():
)


@needs_cffi
def test_export_cast_binary_view_all_inline():
# GH-49740: _export_to_c segmentation fault for binary_view array.
c_schema = ffi.new("struct ArrowSchema*")
ptr_schema = int(ffi.cast("uintptr_t", c_schema))
c_array = ffi.new("struct ArrowArray*")
ptr_array = int(ffi.cast("uintptr_t", c_array))

gc.collect() # Make sure no Arrow data dangles in a ref cycle
old_allocated = pa.total_allocated_bytes()

# The cast used to leave a null variadic buffer slot behind when all
# values were inline.
arr = pa.array([b"a", None, b"e"], type=pa.binary()).cast(pa.binary_view())
arr.validate(full=True)
py_value = arr.to_pylist()
arr._export_to_c(ptr_array, ptr_schema)

assert c_array.length == 3
# validity, views and the appended variadic buffer sizes
assert c_array.n_buffers == 3

del arr
arr_new = pa.Array._import_from_c(ptr_array, ptr_schema)
assert arr_new.to_pylist() == py_value
assert arr_new.type == pa.binary_view()
del arr_new
assert pa.total_allocated_bytes() == old_allocated
# Now released
with assert_schema_released:
pa.Array._import_from_c(ptr_array, ptr_schema)


@needs_cffi
def test_export_import_device_array():
check_export_import_array(
Expand Down
Loading