diff --git a/be/benchmark/benchmark_main.cpp b/be/benchmark/benchmark_main.cpp index 21e629e3d6f92b..0c01bf3ee6da25 100644 --- a/be/benchmark/benchmark_main.cpp +++ b/be/benchmark/benchmark_main.cpp @@ -25,6 +25,7 @@ #include "benchmark_hll_merge.hpp" #include "benchmark_hybrid_set.hpp" #include "benchmark_string.hpp" +#include "benchmark_string_replace.hpp" #include "binary_cast_benchmark.hpp" #include "core/block/block.h" #include "core/column/column_string.h" diff --git a/be/benchmark/benchmark_string_replace.hpp b/be/benchmark/benchmark_string_replace.hpp new file mode 100644 index 00000000000000..240898937ab20d --- /dev/null +++ b/be/benchmark/benchmark_string_replace.hpp @@ -0,0 +1,269 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// ============================================================ +// Benchmark: String Replace — old (per-row std::string) vs new (columnar) +// +// Measures the performance of string replace when needle and +// replacement are both constants. The "new" path uses a two-level +// search strategy: memchr (glibc AVX512) as a per-row first-byte +// pre-filter for fast no-match short-circuiting, then a prebuilt +// ASCIICaseSensitiveStringSearcher (SSE4.1) for full needle matching. 
+// Writes output directly to ColumnString chars/offsets, matching +// _replace_const_pattern() in FunctionReplace. +// ============================================================ + +#include + +#include +#include +#include +#include +#include + +#include "core/column/column_string.h" +#include "exec/common/string_searcher.h" + +namespace doris { + +// ---- Old implementation (current Doris: per-row std::string find/replace) ---- +static std::string replace_old(std::string str, std::string_view old_str, + std::string_view new_str) { + if (old_str.empty()) { // empty needle: input is returned unchanged + return str; + } + std::string::size_type pos = 0; + std::string::size_type old_len = old_str.size(); + std::string::size_type new_len = new_str.size(); + while ((pos = str.find(old_str, pos)) != std::string::npos) { + str.replace(pos, old_len, new_str); + pos += new_len; // resume after the inserted text so it is never re-matched + } + return str; +} + +static void replace_old_column(const ColumnString& src, const std::string& needle, + const std::string& replacement, ColumnString& dst) { + size_t rows = src.size(); + for (size_t i = 0; i < rows; ++i) { // baseline cost: a std::string round trip per row + StringRef ref = src.get_data_at(i); + std::string result = replace_old(ref.to_string(), needle, replacement); + dst.insert_data(result.data(), result.size()); + } +} + +// ---- New implementation (columnar: memchr pre-filter + SSE4.1 searcher) ---- +// Matches the fast path in FunctionReplace::_replace_const_pattern(). +// Two-level search strategy: +// 1. memchr (glibc AVX512) for needle's first byte per row. +// If absent -> guaranteed no match -> single bulk memcpy, no SSE4.1 overhead. +// 2. ASCIICaseSensitiveStringSearcher (SSE4.1, built once) for full needle scan +// only on rows where the first byte was found. +// Writes output directly to ColumnString chars/offsets — no per-row std::string. 
+static void replace_new_column(const ColumnString& src, const std::string& needle, + const std::string& replacement, ColumnString& dst) { + auto& dst_chars = dst.get_chars(); + auto& dst_offsets = dst.get_offsets(); + size_t rows = src.size(); + + dst_chars.reserve(src.get_chars().size()); + dst_offsets.resize(rows); + + if (needle.empty()) { + dst_chars.insert(src.get_chars().begin(), src.get_chars().end()); + memcpy(dst_offsets.data(), src.get_offsets().data(), rows * sizeof(dst_offsets[0])); + return; + } + + // Build SSE4.1 searcher once — first+second byte masks precomputed here. + ASCIICaseSensitiveStringSearcher searcher(needle.data(), needle.size()); + const size_t needle_size = needle.size(); + const size_t replacement_size = replacement.size(); + const char* replacement_data = replacement.data(); + const auto needle_first = static_cast(needle[0]); + + for (size_t i = 0; i < rows; ++i) { + StringRef row = src.get_data_at(i); + const char* const row_end = row.data + row.size; + + // Level-1: memchr for needle's first byte (glibc AVX512). + // First byte absent -> no match possible -> bulk-copy entire row. + if (memchr(row.data, needle_first, row.size) == nullptr) { + size_t old_size = dst_chars.size(); + dst_chars.resize(old_size + row.size); + memcpy(&dst_chars[old_size], row.data, row.size); + dst_offsets[i] = static_cast(dst_chars.size()); // offset i = end of row i in dst_chars + continue; + } + + // Level-2: SSE4.1 searcher for full needle matching, scanning again from row.data. 
+ const char* pos = row.data; + while (pos < row_end) { + const char* match = searcher.search(pos, row_end); + size_t prefix_len = static_cast(match - pos); + if (prefix_len > 0) { + size_t old_size = dst_chars.size(); + dst_chars.resize(old_size + prefix_len); + memcpy(&dst_chars[old_size], pos, prefix_len); + } + if (match == row_end) { + break; + } + if (replacement_size > 0) { + size_t old_size = dst_chars.size(); + dst_chars.resize(old_size + replacement_size); + memcpy(&dst_chars[old_size], replacement_data, replacement_size); + } + pos = match + needle_size; + } + dst_offsets[i] = static_cast(dst_chars.size()); + } +} + +// ---- Helper: random a-z rows; needle planted with per-step probability hit_rate ---- +static ColumnString::MutablePtr build_test_column(size_t num_rows, size_t avg_len, + const std::string& needle, double hit_rate, + unsigned seed = 42) { + auto col = ColumnString::create(); + std::mt19937 rng(seed); // fixed default seed keeps the Old/New inputs identical + std::uniform_int_distribution char_dist('a', 'z'); + std::uniform_real_distribution hit_dist(0.0, 1.0); + std::uniform_int_distribution len_dist(avg_len / 2, avg_len * 3 / 2); + + for (size_t r = 0; r < num_rows; ++r) { + size_t len = len_dist(rng); + std::string s; + s.reserve(len + needle.size() * 3); + size_t written = 0; + while (written < len) { + if (!needle.empty() && hit_dist(rng) < hit_rate && written + needle.size() <= len) { + s += needle; // planted occurrence + written += needle.size(); + } else { + s += static_cast(char_dist(rng)); // random filler byte + ++written; + } + } + col->insert_data(s.data(), s.size()); + } + return col; +} + +// -------- Benchmarks -------- + +// Small strings (avg_len 50), high hit rate (0.1), many rows (10000) +static void BM_Replace_Old_SmallStr(benchmark::State& state) { + std::string needle = "abc"; + std::string replacement = "XY"; + auto src = build_test_column(10000, 50, needle, 0.1); + for (auto _ : state) { + auto dst = ColumnString::create(); + replace_old_column(*src, needle, replacement, *dst); + benchmark::DoNotOptimize(dst); + } +} + +static void 
BM_Replace_New_SmallStr(benchmark::State& state) { + std::string needle = "abc"; + std::string replacement = "XY"; + auto src = build_test_column(10000, 50, needle, 0.1); + for (auto _ : state) { + auto dst = ColumnString::create(); + replace_new_column(*src, needle, replacement, *dst); + benchmark::DoNotOptimize(dst); + } +} + +// Medium strings (avg_len 200), moderate hit rate (0.05), 5000 rows +static void BM_Replace_Old_MedStr(benchmark::State& state) { + std::string needle = "hello"; + std::string replacement = "world!"; + auto src = build_test_column(5000, 200, needle, 0.05); + for (auto _ : state) { + auto dst = ColumnString::create(); + replace_old_column(*src, needle, replacement, *dst); + benchmark::DoNotOptimize(dst); + } +} + +static void BM_Replace_New_MedStr(benchmark::State& state) { + std::string needle = "hello"; + std::string replacement = "world!"; + auto src = build_test_column(5000, 200, needle, 0.05); + for (auto _ : state) { + auto dst = ColumnString::create(); + replace_new_column(*src, needle, replacement, *dst); + benchmark::DoNotOptimize(dst); + } +} + +// Large strings (avg_len 1000), low hit rate (0.02), 1000 rows +static void BM_Replace_Old_LargeStr(benchmark::State& state) { + std::string needle = "pattern"; + std::string replacement = "REPLACED"; + auto src = build_test_column(1000, 1000, needle, 0.02); + for (auto _ : state) { + auto dst = ColumnString::create(); + replace_old_column(*src, needle, replacement, *dst); + benchmark::DoNotOptimize(dst); + } +} + +static void BM_Replace_New_LargeStr(benchmark::State& state) { + std::string needle = "pattern"; + std::string replacement = "REPLACED"; + auto src = build_test_column(1000, 1000, needle, 0.02); + for (auto _ : state) { + auto dst = ColumnString::create(); + replace_new_column(*src, needle, replacement, *dst); + benchmark::DoNotOptimize(dst); + } +} + +// No matches — 'Z' never occurs in the a-z data, so the new path stops at the memchr pre-filter +static void BM_Replace_Old_NoMatch(benchmark::State& state) { + std::string needle = "ZZZZZZ"; + std::string replacement = "X"; + 
auto src = build_test_column(10000, 100, "abc", 0.0); // hit_rate 0: no ZZZZZZ in data + for (auto _ : state) { + auto dst = ColumnString::create(); + replace_old_column(*src, needle, replacement, *dst); + benchmark::DoNotOptimize(dst); + } +} + +static void BM_Replace_New_NoMatch(benchmark::State& state) { + std::string needle = "ZZZZZZ"; + std::string replacement = "X"; + auto src = build_test_column(10000, 100, "abc", 0.0); // same seed => same rows as Old + for (auto _ : state) { + auto dst = ColumnString::create(); + replace_new_column(*src, needle, replacement, *dst); + benchmark::DoNotOptimize(dst); + } +} + +BENCHMARK(BM_Replace_Old_SmallStr); +BENCHMARK(BM_Replace_New_SmallStr); +BENCHMARK(BM_Replace_Old_MedStr); +BENCHMARK(BM_Replace_New_MedStr); +BENCHMARK(BM_Replace_Old_LargeStr); +BENCHMARK(BM_Replace_New_LargeStr); +BENCHMARK(BM_Replace_Old_NoMatch); +BENCHMARK(BM_Replace_New_NoMatch); + +} // namespace doris diff --git a/be/src/exprs/function/function_string_replace.h b/be/src/exprs/function/function_string_replace.h index bdd5647a9327b5..ab2dd382c96679 100644 --- a/be/src/exprs/function/function_string_replace.h +++ b/be/src/exprs/function/function_string_replace.h @@ -35,6 +35,7 @@ #include "core/data_type/data_type_number.h" #include "core/data_type/data_type_string.h" #include "core/string_ref.h" +#include "exec/common/string_searcher.h" #include "exec/common/stringop_substring.h" #include "exec/common/template_helpers.hpp" #include "exprs/function/function.h" @@ -87,6 +88,23 @@ class FunctionReplace : public IFunction { ColumnString::MutablePtr col_res = ColumnString::create(); + // Fast path: when old_str and new_str are both constant and old_str is + // non-empty (the common case for replace(col, 'literal', 'literal')). + // Works directly on ColumnString chars/offsets to avoid per-row + // std::string allocation and copy overhead. + // Applies to both replace (empty=true) and replace_empty (empty=false): + // when old_str is non-empty the two variants behave identically. 
+ if (col_const[1] && col_const[2]) { + StringRef old_ref = col_old_str->get_data_at(0); + StringRef new_ref = col_new_str->get_data_at(0); + if (old_ref.size > 0) { + _replace_const_pattern(*col_origin_str, old_ref, new_ref, *col_res, + input_rows_count, col_const[0]); + block.replace_by_position(result, std::move(col_res)); + return Status::OK(); + } + } + std::visit( [&](auto origin_str_const, auto old_str_const, auto new_str_const) { for (int i = 0; i < input_rows_count; ++i) { @@ -112,6 +130,70 @@ class FunctionReplace : public IFunction { } private: + // Optimized replace path for constant old_str (non-empty) and constant new_str. + // Avoids per-row std::string allocation by working directly on ColumnString + // chars/offsets. Two-level search strategy: + // 1. memchr (glibc AVX512) scans for the needle's first byte. If absent, + // the row is guaranteed no-match and is bulk-copied with a single memcpy. + // 2. When the first byte is present, ASCIICaseSensitiveStringSearcher + // (SSE4.1, prebuilt once outside the row loop) does the full needle scan. + static void _replace_const_pattern(const ColumnString& src, StringRef old_ref, + StringRef new_ref, ColumnString& dst, + size_t input_rows_count, bool src_const) { + auto& dst_chars = dst.get_chars(); + auto& dst_offsets = dst.get_offsets(); + + dst_chars.reserve(src_const ? (src.get_data_at(0).size * input_rows_count) + : src.get_chars().size()); + dst_offsets.resize(input_rows_count); // one end offset per output row + + // Build SSE4.1 searcher once — first+second byte masks precomputed here. + ASCIICaseSensitiveStringSearcher searcher(old_ref.data, old_ref.size); + const size_t needle_size = old_ref.size; // > 0, checked at the call site + const size_t replacement_size = new_ref.size; + const char* replacement_data = new_ref.data; + const auto needle_first = static_cast(old_ref.data[0]); + + for (size_t i = 0; i < input_rows_count; ++i) { // const src: row 0 is reused for every i + StringRef row = src.get_data_at(src_const ? 
0 : i); + const char* const row_end = row.data + row.size; + + // Level-1: memchr for needle's first byte (glibc uses AVX512 internally). + // If the first byte is absent the entire row cannot contain the needle; + // bulk-copy it and move to the next row without entering the SSE4.1 loop. + if (memchr(row.data, needle_first, row.size) == nullptr) { + StringOP::push_value_string({row.data, row.size}, i, dst_chars, dst_offsets); + continue; + } + + // Level-2: SSE4.1 searcher handles needle matching for this row. + const char* pos = row.data; + while (pos < row_end) { + const char* match = searcher.search(pos, row_end); + // Copy the unmatched bytes in [pos, match) + size_t prefix_len = static_cast(match - pos); + if (prefix_len > 0) { + size_t old_size = dst_chars.size(); + ColumnString::check_chars_length(old_size + prefix_len, i + 1); + dst_chars.resize(old_size + prefix_len); + memcpy(&dst_chars[old_size], pos, prefix_len); + } + if (match == row_end) { // no further occurrence: the row tail was just copied + break; + } + // Emit the replacement in place of the matched needle + if (replacement_size > 0) { + size_t old_size = dst_chars.size(); + ColumnString::check_chars_length(old_size + replacement_size, i + 1); + dst_chars.resize(old_size + replacement_size); + memcpy(&dst_chars[old_size], replacement_data, replacement_size); + } + pos = match + needle_size; // resume after the consumed needle + } + StringOP::push_empty_string(i, dst_chars, dst_offsets); + } + } + std::string replace(std::string str, std::string_view old_str, std::string_view new_str) const { if (old_str.empty()) { if constexpr (empty) {