From c510f78725e55ee89c0ce7e325dfdc5ff683a144 Mon Sep 17 00:00:00 2001 From: fenfeng9 Date: Fri, 12 Jun 2026 22:25:14 +0800 Subject: [PATCH 1/5] Fix all-inline binary view casts for C export --- .../compute/kernels/scalar_cast_string.cc | 14 ++-- .../arrow/compute/kernels/scalar_cast_test.cc | 69 +++++++++++++++++++ python/pyarrow/tests/test_cffi.py | 33 +++++++++ 3 files changed, 111 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index 4d0aa943ed90..52b79c8d452d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -438,8 +438,7 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou auto* out_views = output->GetMutableValues(1); - // If all entries are inline, we can drop the extra data buffer for - // large strings in output->buffers[2]. + // If all entries are inline, there are no variadic data buffers to expose. bool all_entries_are_inline = true; VisitSetBitRunsVoid( validity, output->offset, output->length, @@ -463,8 +462,9 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou } }); if (all_entries_are_inline) { - output->buffers[2] = nullptr; + output->buffers.resize(2); } + return Status::OK(); } @@ -508,11 +508,15 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou const int32_t fixed_size_width = input.type->byte_width(); const int64_t total_length = input.offset + input.length; + // Values of width <= BinaryViewType::kInlineSize are stored inline in the + // views, so the output only needs a variadic data buffer for larger widths. + const bool values_are_inline = fixed_size_width <= BinaryViewType::kInlineSize; + const size_t num_buffers = values_are_inline ? 2 : 3; ArrayData* output = out->array_data().get(); DCHECK_EQ(output->length, input.length); output->offset = input.offset; - output->buffers.resize(3); + output->buffers.resize(num_buffers); output->SetNullCount(input.null_count); // Share the validity bitmap buffer output->buffers[0] = input.GetBuffer(0); @@ -539,7 +543,7 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou } // Inline string and non-inline string loops - if (fixed_size_width <= BinaryViewType::kInlineSize) { + if (values_are_inline) { int32_t data_offset = static_cast(input.offset) * fixed_size_width; for (int64_t i = 0; i < input.length; i++) { auto& out_view = out_views[i]; diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 4ff58040e05e..76bcdc155401 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -3361,6 +3361,75 @@ TEST(Cast, BinaryOrStringToBinary) { } } +TEST(Cast, BinaryOrStringToView) { + // GH-49740: when all values were inline, the cast left a null variadic + // buffer slot behind and exporting to the C data interface crashed. + const std::vector> cases = { + // Empty inputs are handled before the cast kernel runs. + {"[]", 2}, + {R"(["a", null, "e"])", 2}, + {"[null, null]", 2}, + {R"(["aaaaaaaaaaaaa", null, "eeeeeeeeeeeee"])", 3}, + {R"(["a", null, "aaaaaaaaaaaaa"])", 3}, + }; + + for (auto from_type : {binary(), large_binary(), utf8(), large_utf8()}) { + for (auto to_type : {binary_view(), utf8_view()}) { + for (const auto& [json, expected_num_buffers] : cases) { + auto input = ArrayFromJSON(from_type, json); + auto expected = ArrayFromJSON(to_type, json); + + ASSERT_OK_AND_ASSIGN(auto casted, Cast(*input, to_type)); + ValidateOutput(*casted); + AssertArraysEqual(*expected, *casted); + + ASSERT_EQ(casted->data()->buffers.size(), expected_num_buffers) + << "from: " << from_type->ToString() << ", to: " << to_type->ToString() + << ", values: " << json; + if (expected_num_buffers == 3) { + ASSERT_NE(casted->data()->buffers[2], nullptr) + << "from: " << from_type->ToString() << ", to: " << to_type->ToString() + << ", values: " << json; + } + } + } + } +} + +TEST(Cast, FixedSizeBinaryToView) { + // Fixed-size binary uses a separate cast kernel with the same GH-49740 + // issue. Cover non-empty widths on both sides of the BinaryView inline limit. + const std::vector, std::string, int64_t>> cases = { + // Empty inputs are handled before the cast kernel runs. + {fixed_size_binary(1), "[]", 2}, + {fixed_size_binary(13), "[]", 2}, + {fixed_size_binary(1), R"(["a", null, "e"])", 2}, + {fixed_size_binary(1), "[null, null]", 2}, + {fixed_size_binary(12), R"(["aaaaaaaaaaaa", null])", 2}, + {fixed_size_binary(13), R"(["aaaaaaaaaaaaa", null, "eeeeeeeeeeeee"])", 3}, + }; + + for (const auto& [from_type, json, expected_num_buffers] : cases) { + for (auto to_type : {binary_view(), utf8_view()}) { + auto input = ArrayFromJSON(from_type, json); + auto expected = ArrayFromJSON(to_type, json); + + ASSERT_OK_AND_ASSIGN(auto casted, Cast(*input, to_type)); + ValidateOutput(*casted); + AssertArraysEqual(*expected, *casted); + + ASSERT_EQ(casted->data()->buffers.size(), expected_num_buffers) + << "from: " << from_type->ToString() << ", to: " << to_type->ToString() + << ", values: " << json; + if (expected_num_buffers == 3) { + ASSERT_NE(casted->data()->buffers[2], nullptr) + << "from: " << from_type->ToString() << ", to: " << to_type->ToString() + << ", values: " << json; + } + } + } +} + TEST(Cast, StringToString) { for (auto from_type : {utf8(), utf8_view(), large_utf8()}) { for (auto to_type : {utf8(), utf8_view(), large_utf8()}) { diff --git a/python/pyarrow/tests/test_cffi.py b/python/pyarrow/tests/test_cffi.py index 481c387d5337..0d9fd7245972 100644 --- a/python/pyarrow/tests/test_cffi.py +++ b/python/pyarrow/tests/test_cffi.py @@ -234,6 +234,39 @@ def test_export_import_array(): ) +@needs_cffi +def test_export_cast_binary_view_all_inline(): + # GH-49740: _export_to_c segmentation fault for binary_view array. + c_schema = ffi.new("struct ArrowSchema*") + ptr_schema = int(ffi.cast("uintptr_t", c_schema)) + c_array = ffi.new("struct ArrowArray*") + ptr_array = int(ffi.cast("uintptr_t", c_array)) + + gc.collect() # Make sure no Arrow data dangles in a ref cycle + old_allocated = pa.total_allocated_bytes() + + # The cast used to leave a null variadic buffer slot behind when all + # values were inline. + arr = pa.array([b"a", None, b"e"], type=pa.binary()).cast(pa.binary_view()) + arr.validate(full=True) + py_value = arr.to_pylist() + arr._export_to_c(ptr_array, ptr_schema) + + assert c_array.length == 3 + # validity, views and the appended variadic buffer sizes + assert c_array.n_buffers == 3 + + del arr + arr_new = pa.Array._import_from_c(ptr_array, ptr_schema) + assert arr_new.to_pylist() == py_value + assert arr_new.type == pa.binary_view() + del arr_new + assert pa.total_allocated_bytes() == old_allocated + # Now released + with assert_schema_released: + pa.Array._import_from_c(ptr_array, ptr_schema) + + @needs_cffi def test_export_import_device_array(): check_export_import_array( From c7ebd7738c125c79e25e6008572bf34a142d03a9 Mon Sep 17 00:00:00 2001 From: fenfeng9 Date: Fri, 12 Jun 2026 22:50:38 +0800 Subject: [PATCH 2/5] Validate non-null variadic buffers --- cpp/src/arrow/array/array_binary_test.cc | 8 ++++++++ cpp/src/arrow/array/validate.cc | 5 +++++ 2 files changed, 13 insertions(+) diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index 04391be0ac78..310111f90d9d 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -403,6 +403,14 @@ TEST(StringViewArray, Validate) { }), Ok()); + // Variadic buffer slots, when present, must contain real buffers. + EXPECT_THAT(MakeBinaryViewArray({nullptr}, + { + util::ToInlineBinaryView("hello"), + util::ToInlineBinaryView("world"), + }), + Raises(StatusCode::Invalid)); + // non-inline views are expected to reference only buffers managed by the array EXPECT_THAT( MakeBinaryViewArray( diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index c1d96375bf09..76b4afe542d2 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -504,6 +504,11 @@ struct ValidateArrayImpl { : *layout.variadic_spec; if (buffer == nullptr) { + if (layout.variadic_spec && i >= static_cast(layout.buffers.size())) { + return Status::Invalid("Array of type ", type.ToString(), + " has a null variadic buffer at buffer index ", i, + ", variadic buffer index ", i - layout.buffers.size()); + } continue; } int64_t min_buffer_size = 0; From ce914c1a3fd4a9ef43f8464ca7bad45004c5f83c Mon Sep 17 00:00:00 2001 From: fenfeng9 Date: Fri, 12 Jun 2026 23:13:09 +0800 Subject: [PATCH 3/5] Validate variadic buffers before C export --- cpp/src/arrow/array/validate.cc | 5 +++-- cpp/src/arrow/c/bridge.cc | 5 +++++ cpp/src/arrow/c/bridge_test.cc | 14 ++++++++++++++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 76b4afe542d2..16bc9187af46 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -506,8 +506,9 @@ struct ValidateArrayImpl { if (buffer == nullptr) { if (layout.variadic_spec && i >= static_cast(layout.buffers.size())) { return Status::Invalid("Array of type ", type.ToString(), - " has a null variadic buffer at buffer index ", i, - ", variadic buffer index ", i - layout.buffers.size()); + " has a null variadic buffer at buffer index #", i, + " (variadic buffer index #", i - layout.buffers.size(), + ")"); } continue; } diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index 82e74098d248..4391d4cbc248 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -608,6 +608,11 @@ struct ArrayExporter { export_.variadic_buffer_sizes_.resize(variadic_buffers.size()); size_t i = 0; for (const auto& buf : variadic_buffers) { + if (buf == nullptr) { + return Status::Invalid("Cannot export array of type ", data->type->ToString(), + ": null variadic buffer at buffer index #", i + 2, + " (variadic buffer index #", i, ")"); + } export_.variadic_buffer_sizes_[i++] = buf->size(); } export_.buffers_.back() = export_.variadic_buffer_sizes_.data(); diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index cb204806f90c..cc42cba180da 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -930,6 +930,20 @@ TEST_F(TestArrayExport, PrimitiveSliced) { TestPrimitive(factory); } +TEST_F(TestArrayExport, RejectNullVariadicBuffers) { + auto arr = std::make_shared( + ArrayData::Make(binary_view(), /*length=*/2, + {nullptr, + Buffer::FromVector(std::vector{ + util::ToInlineBinaryView("hello"), + util::ToInlineBinaryView("world"), + }), + nullptr})); + + struct ArrowArray c_export; + ASSERT_RAISES(Invalid, ExportArray(*arr, &c_export)); +} + constexpr std::string_view binary_view_buffer_content0 = "12345foo bar baz quux", binary_view_buffer_content1 = "BinaryViewMultipleBuffers"; From 4dc1e3b057bc6ca3e154bbf19bfa14ca308c7cad Mon Sep 17 00:00:00 2001 From: fenfeng9 Date: Fri, 12 Jun 2026 23:28:25 +0800 Subject: [PATCH 4/5] Test C export guard for string_view --- cpp/src/arrow/c/bridge_test.cc | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index cc42cba180da..9b4090a8bfa3 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -931,17 +931,19 @@ TEST_F(TestArrayExport, PrimitiveSliced) { } TEST_F(TestArrayExport, RejectNullVariadicBuffers) { - auto arr = std::make_shared( - ArrayData::Make(binary_view(), /*length=*/2, - {nullptr, - Buffer::FromVector(std::vector{ - util::ToInlineBinaryView("hello"), - util::ToInlineBinaryView("world"), - }), - nullptr})); - - struct ArrowArray c_export; - ASSERT_RAISES(Invalid, ExportArray(*arr, &c_export)); + for (const auto& type : {binary_view(), utf8_view()}) { + auto arr = + MakeArray(ArrayData::Make(type, /*length=*/2, + {nullptr, + Buffer::FromVector(std::vector{ + util::ToInlineBinaryView("hello"), + util::ToInlineBinaryView("world"), + }), + nullptr})); + + struct ArrowArray c_export; + ASSERT_RAISES(Invalid, ExportArray(*arr, &c_export)); + } } constexpr std::string_view binary_view_buffer_content0 = "12345foo bar baz quux", From 9c8b26870f14a0b7c8ff0b34bb8719be98896116 Mon Sep 17 00:00:00 2001 From: fenfeng9 Date: Sat, 13 Jun 2026 00:29:19 +0800 Subject: [PATCH 5/5] update cast tests for inline view buffers --- .../arrow/compute/kernels/scalar_cast_test.cc | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 76bcdc155401..51e6ca534cd5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -3307,9 +3307,12 @@ TEST(Cast, BinaryToString) { // N.B. null buffer is not always the same if input sliced AssertBufferSame(*invalid_utf8, *strings, 0); - // ARROW-16757: we no longer zero copy, but the contents are equal - ASSERT_NE(invalid_utf8->data()->buffers[1].get(), strings->data()->buffers[2].get()); - if (!is_binary_view_like(*string_type)) { + if (is_binary_view_like(*string_type)) { + ASSERT_EQ(strings->data()->buffers.size(), 2); + } else { + // ARROW-16757: we no longer zero copy, but the contents are equal + ASSERT_NE(invalid_utf8->data()->buffers[1].get(), + strings->data()->buffers[2].get()); ASSERT_TRUE(invalid_utf8->data()->buffers[1]->Equals(*strings->data()->buffers[2])); } } @@ -3349,9 +3352,12 @@ TEST(Cast, BinaryOrStringToBinary) { // N.B. null buffer is not always the same if input sliced AssertBufferSame(*invalid_utf8, *strings, 0); - // ARROW-16757: we no longer zero copy, but the contents are equal - ASSERT_NE(invalid_utf8->data()->buffers[1].get(), strings->data()->buffers[2].get()); - if (!is_binary_view_like(*to_type)) { + if (is_binary_view_like(*to_type)) { + ASSERT_EQ(strings->data()->buffers.size(), 2); + } else { + // ARROW-16757: we no longer zero copy, but the contents are equal + ASSERT_NE(invalid_utf8->data()->buffers[1].get(), + strings->data()->buffers[2].get()); ASSERT_TRUE(invalid_utf8->data()->buffers[1]->Equals(*strings->data()->buffers[2])); }