Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions be/src/core/column/column_string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include "util/simd/bits.h"
#include "util/simd/vstring_function.h"
#include "util/unaligned.h"
#include "util/utf8_check.h"
namespace doris {

template <typename T>
Expand Down Expand Up @@ -759,6 +760,20 @@ bool ColumnStr<T>::is_ascii() const {
return simd::VStringFunctions::is_ascii(StringRef(chars.data(), chars.size()));
}

// Reports whether every row of this column is, taken on its own, well-formed UTF-8.
//
// Each row is checked separately through offset_at()/size_at() instead of validating
// the `chars` buffer in one call: bytes of adjacent rows could otherwise combine into
// a sequence that looks valid only across a row boundary.
// Returns true for a column without rows, since the loop body never runs.
template <typename T>
bool ColumnStr<T>::is_valid_utf8() const {
    const char* const base = reinterpret_cast<const char*>(chars.data());
    for (size_t row = 0, row_count = offsets.size(); row != row_count; ++row) {
        const bool row_ok = validate_utf8(base + offset_at(row), size_at(row));
        if (!row_ok) {
            // One malformed row is enough to reject the whole column.
            return false;
        }
    }
    return true;
}
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`ColumnStr` stores row payloads contiguously without any delimiter, so UTF-8 validity is not compositional here. Validating the whole `chars` buffer can return `true` even when an individual row is invalid. A concrete case is rows "\xE4" and "\xB8\x96": each row is invalid by itself, but the concatenated buffer forms the valid UTF-8 sequence for 世 (U+4E16), so this helper would incorrectly accept the column. Since this API is introduced as column-level UTF-8 validation, it needs to walk the offsets and validate each row independently.


template class ColumnStr<uint32_t>;
template class ColumnStr<uint64_t>;
} // namespace doris
4 changes: 4 additions & 0 deletions be/src/core/column/column_string.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ class Arena;
class ColumnSorter;

/** Column for String values.
* Note: In string functions, we assume that ColumnStr contains valid UTF-8 encoded data.
* However, ColumnStr is not guaranteed to always hold valid UTF-8, since it is also used
* as a serialization container where the content may be arbitrary binary data.
*/
template <typename T>
class ColumnStr final : public COWHelper<IColumn, ColumnStr<T>> {
Expand Down Expand Up @@ -536,6 +539,7 @@ class ColumnStr final : public COWHelper<IColumn, ColumnStr<T>> {
}

bool is_ascii() const;
bool is_valid_utf8() const;

Chars& get_chars() { return chars; }
const Chars& get_chars() const { return chars; }
Expand Down
27 changes: 27 additions & 0 deletions be/src/exprs/function/function_string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
#include "exprs/function/string_hex_util.h"
#include "util/string_search.hpp"
#include "util/url_coding.h"
#include "util/utf8_check.h"

namespace doris {
struct NameStringASCII {
Expand Down Expand Up @@ -225,6 +226,29 @@ struct StringUtf8LengthImpl {
}
};

// Name tag for the UTF-8 validity function. It is passed as the name parameter of
// FunctionUnaryToType, and its `name` is the primary function name that aliases are
// registered against.
struct NameIsValidUTF8 {
static constexpr auto name = "is_valid_utf8";
};

// Row-wise implementation of is_valid_utf8, plugged into FunctionUnaryToType.
// For every input string it writes 1 when validate_utf8() accepts the row's bytes
// and 0 otherwise.
struct IsValidUTF8Impl {
    using ReturnType = DataTypeUInt8;
    static constexpr auto PrimitiveTypeImpl = PrimitiveType::TYPE_STRING;
    using Type = String;
    using ReturnColumnType = ColumnUInt8;

    // data    - concatenated bytes of all rows.
    // offsets - offsets[i] is the end position of row i inside `data`; row i spans
    //           [offsets[i - 1], offsets[i]).
    // res     - resized to one entry per row and filled with 1 (valid) or 0 (invalid).
    // Always returns Status::OK().
    static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets,
                         PaddedPODArray<UInt8>& res) {
        const size_t row_count = offsets.size();
        res.resize(row_count);
        for (size_t row = 0; row < row_count; ++row) {
            // NOTE(review): for row 0 this reads offsets[-1]; presumably the padded
            // offsets array guarantees that slot is readable and zero - confirm.
            const auto begin = offsets[row - 1];
            const auto end = offsets[row];
            const char* row_bytes = reinterpret_cast<const char*>(&data[begin]);
            const size_t row_len = end - begin;
            if (validate_utf8(row_bytes, row_len)) {
                res[row] = 1;
            } else {
                res[row] = 0;
            }
        }
        return Status::OK();
    }
};

// Name tag for the starts_with function; supplied as the name parameter of the
// FunctionBinaryToType instantiation for StringStartsWithImpl.
struct NameStartsWith {
static constexpr auto name = "starts_with";
};
Expand Down Expand Up @@ -1316,6 +1340,7 @@ using FunctionStringLength = FunctionUnaryToType<StringLengthImpl, NameStringLen
using FunctionCrc32 = FunctionUnaryToType<Crc32Impl, NameCrc32>;
using FunctionStringUTF8Length = FunctionUnaryToType<StringUtf8LengthImpl, NameStringUtf8Length>;
using FunctionStringSpace = FunctionUnaryToType<StringSpace, NameStringSpace>;
using FunctionIsValidUTF8 = FunctionUnaryToType<IsValidUTF8Impl, NameIsValidUTF8>;
using FunctionStringStartsWith =
FunctionBinaryToType<DataTypeString, DataTypeString, StringStartsWithImpl, NameStartsWith>;
using FunctionStringEndsWith =
Expand Down Expand Up @@ -1422,7 +1447,9 @@ void register_function_string(SimpleFunctionFactory& factory) {
factory.register_function<FunctionSubReplace<SubReplaceThreeImpl>>();
factory.register_function<FunctionSubReplace<SubReplaceFourImpl>>();
factory.register_function<FunctionOverlay>();
factory.register_function<FunctionIsValidUTF8>();

factory.register_alias(FunctionIsValidUTF8::name, "isValidUTF8");
factory.register_alias(FunctionToLower::name, "lcase");
factory.register_alias(FunctionToUpper::name, "ucase");
factory.register_alias(FunctionStringUTF8Length::name, "character_length");
Expand Down
85 changes: 85 additions & 0 deletions be/test/core/column/column_string_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1425,4 +1425,89 @@ TEST_F(ColumnStringTest, is_ascii) {
}
}

// Exercises ColumnString::is_valid_utf8(): well-formed columns must be accepted, and a
// column must be rejected as soon as any single row is malformed on its own.
TEST_F(ColumnStringTest, is_valid_utf8) {
    // Builds a one-row column from `len` raw bytes and reports the validation result.
    const auto single_row_is_valid = [](const char* bytes, size_t len) {
        auto col = ColumnString::create();
        col->insert_data(bytes, len);
        return col->is_valid_utf8();
    };

    // Pure ASCII rows are always well-formed UTF-8.
    {
        auto col = ColumnString::create();
        col->insert_data("hello", 5);
        col->insert_data("world", 5);
        col->insert_data("123!@#", 6);
        EXPECT_TRUE(col->is_valid_utf8());
    }
    // A column without any rows is accepted.
    {
        auto col = ColumnString::create();
        EXPECT_TRUE(col->is_valid_utf8());
    }
    // Zero-length rows are accepted.
    {
        auto col = ColumnString::create();
        col->insert_data("", 0);
        col->insert_data("", 0);
        EXPECT_TRUE(col->is_valid_utf8());
    }
    // Well-formed multi-byte characters (2-, 3- and 4-byte encodings).
    {
        auto col = ColumnString::create();
        col->insert_data("Hello, 世界", strlen("Hello, 世界"));
        col->insert_data("こんにちは", strlen("こんにちは"));
        col->insert_data("😀", strlen("😀"));
        EXPECT_TRUE(col->is_valid_utf8());
    }

    // Rejected: lone continuation byte 0x80.
    EXPECT_FALSE(single_row_is_valid("\x80", 1));
    // Rejected: 2-byte lead 0xC3 followed by a non-continuation byte 0x28.
    EXPECT_FALSE(single_row_is_valid("\xc3\x28", 2));
    // Rejected: overlong encoding 0xC0 0xAF.
    EXPECT_FALSE(single_row_is_valid("\xc0\xaf", 2));
    // Rejected: byte 0xFE.
    EXPECT_FALSE(single_row_is_valid("\xfe", 1));
    // Rejected: 3-byte sequence 0xE4 0xB8 cut off before its last byte.
    EXPECT_FALSE(single_row_is_valid("\xe4\xb8", 2));

    // One malformed row between well-formed rows invalidates the whole column.
    {
        auto col = ColumnString::create();
        col->insert_data("hello", 5);
        col->insert_data("\xff", 1);
        col->insert_data("world", 5);
        EXPECT_FALSE(col->is_valid_utf8());
    }
    // Row boundaries matter: "\xE4" followed by "\xB8\x96" spells 世 once the bytes are
    // concatenated, yet neither row is valid by itself, so the column must be rejected.
    {
        auto col = ColumnString::create();
        col->insert_data("\xe4", 1);
        col->insert_data("\xb8\x96", 2);
        EXPECT_FALSE(col->is_valid_utf8());
    }
}

} // namespace doris
30 changes: 30 additions & 0 deletions be/test/exprs/function/function_string_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -994,6 +994,36 @@ TEST(function_string_test, function_ascii_test) {
check_function_all_arg_comb<DataTypeInt32, true>(func_name, input_types, data_set);
}

// Table-driven check of the SQL function `is_valid_utf8`: every row pairs one input
// string with the expected result (1 = well-formed UTF-8, 0 = malformed), plus a NULL
// input that is expected to produce NULL.
TEST(function_string_test, function_is_valid_utf8_test) {
std::string func_name = "is_valid_utf8";

// The function takes a single string argument.
InputTypeSet input_types = {PrimitiveType::TYPE_VARCHAR};

DataSet data_set = {
// valid UTF-8 strings
{{std::string("hello")}, std::uint8_t(1)},
{{std::string("")}, std::uint8_t(1)},
{{std::string("Hello, 世界")}, std::uint8_t(1)},
{{std::string("こんにちは")}, std::uint8_t(1)},
{{std::string("123!@#")}, std::uint8_t(1)},
{{std::string("\xc3\xb1")}, std::uint8_t(1)}, // ñ
{{std::string("\xe2\x82\xac")}, std::uint8_t(1)}, // €
{{std::string("\xf0\x9f\x98\x80")}, std::uint8_t(1)}, // 😀
// invalid UTF-8 strings
{{std::string("\x80")}, std::uint8_t(0)}, // lone continuation byte, cannot start a sequence
{{std::string("\xc3\x28")}, std::uint8_t(0)}, // invalid 2-byte sequence
{{std::string("\xe2\x28\xa1")}, std::uint8_t(0)}, // invalid 3-byte sequence
{{std::string("\xf0\x28\x8c\xbc")}, std::uint8_t(0)}, // invalid 4-byte sequence
{{std::string("\xfe")}, std::uint8_t(0)}, // invalid byte 0xFE
{{std::string("\xff")}, std::uint8_t(0)}, // invalid byte 0xFF
{{std::string("abc\xc0\xaf")}, std::uint8_t(0)}, // overlong encoding after a valid ASCII prefix
// NULL input is expected to yield NULL
{{Null()}, Null()},
};

check_function_all_arg_comb<DataTypeUInt8, true>(func_name, input_types, data_set);
}

TEST(function_string_test, function_char_length_test) {
std::string func_name = "char_length";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,7 @@
import org.apache.doris.nereids.trees.expressions.functions.scalar.IsIpv6String;
import org.apache.doris.nereids.trees.expressions.functions.scalar.IsNan;
import org.apache.doris.nereids.trees.expressions.functions.scalar.IsUuid;
import org.apache.doris.nereids.trees.expressions.functions.scalar.IsValidUtf8;
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonArray;
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonArrayIgnoreNull;
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonContains;
Expand Down Expand Up @@ -837,6 +838,7 @@ public class BuiltinScalarFunctions implements FunctionHelper {
scalar(IsIpAddressInRange.class, "is_ip_address_in_range"),
scalar(IsNan.class, "isnan"),
scalar(IsUuid.class, "is_uuid"),
scalar(IsValidUtf8.class, "is_valid_utf8", "isValidUTF8"),
scalar(IsInf.class, "isinf"),
scalar(Ipv4CIDRToRange.class, "ipv4_cidr_to_range"),
scalar(Ipv6CIDRToRange.class, "ipv6_cidr_to_range"),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.doris.nereids.trees.expressions.functions.scalar;

import org.apache.doris.catalog.FunctionSignature;
import org.apache.doris.nereids.trees.expressions.Expression;
import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression;
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
import org.apache.doris.nereids.types.BooleanType;
import org.apache.doris.nereids.types.StringType;
import org.apache.doris.nereids.types.VarcharType;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;

import java.util.List;

/**
* ScalarFunction 'is_valid_utf8'.
*/
/**
 * Nereids expression node for the scalar function {@code is_valid_utf8}.
 *
 * <p>The function takes exactly one argument, typed either as VARCHAR or STRING, and
 * its declared return type is BOOLEAN.
 */
public class IsValidUtf8 extends ScalarFunction
        implements UnaryExpression, ExplicitlyCastableSignature, PropagateNullable {

    /** Supported overloads: one for a VARCHAR argument and one for a STRING argument. */
    public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
            FunctionSignature.ret(BooleanType.INSTANCE).args(VarcharType.SYSTEM_DEFAULT),
            FunctionSignature.ret(BooleanType.INSTANCE).args(StringType.INSTANCE)
    );

    /**
     * Creates the function call over a single argument expression.
     *
     * @param input the expression whose value is checked
     */
    public IsValidUtf8(Expression input) {
        super("is_valid_utf8", input);
    }

    /** Internal constructor used by {@link #withChildren(List)} to reuse function params. */
    private IsValidUtf8(ScalarFunctionParams params) {
        super(params);
    }

    @Override
    public List<FunctionSignature> getSignatures() {
        return SIGNATURES;
    }

    /**
     * Returns a copy of this function bound to the given children.
     *
     * @param newChildren replacement children; must contain exactly one expression
     * @throws IllegalArgumentException if {@code newChildren} does not have exactly one element
     */
    @Override
    public IsValidUtf8 withChildren(List<Expression> newChildren) {
        Preconditions.checkArgument(newChildren.size() == 1);
        ScalarFunctionParams rebuiltParams = getFunctionParams(newChildren);
        return new IsValidUtf8(rebuiltParams);
    }

    /** Dispatches to {@code visitIsValidUtf8} on the given visitor. */
    @Override
    public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
        return visitor.visitIsValidUtf8(this, context);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,7 @@
import org.apache.doris.nereids.trees.expressions.functions.scalar.IsIpv6String;
import org.apache.doris.nereids.trees.expressions.functions.scalar.IsNan;
import org.apache.doris.nereids.trees.expressions.functions.scalar.IsUuid;
import org.apache.doris.nereids.trees.expressions.functions.scalar.IsValidUtf8;
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonArray;
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonArrayIgnoreNull;
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonContains;
Expand Down Expand Up @@ -1701,6 +1702,10 @@ default R visitIsUuid(IsUuid isUuid, C context) {
return visitScalarFunction(isUuid, context);
}

/**
 * Visits an {@link IsValidUtf8} expression. The default implementation forwards to
 * {@code visitScalarFunction}, so visitors only override this when they need
 * handling specific to this function.
 */
default R visitIsValidUtf8(IsValidUtf8 isValidUtf8, C context) {
return visitScalarFunction(isValidUtf8, context);
}

/**
 * Visits an {@link IsInf} expression. The default implementation forwards to
 * {@code visitScalarFunction}.
 */
default R visitIsInf(IsInf isInf, C context) {
return visitScalarFunction(isInf, context);
}
Expand Down
Loading
Loading