Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions be/benchmark/benchmark_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "benchmark_hll_merge.hpp"
#include "benchmark_hybrid_set.hpp"
#include "benchmark_string.hpp"
#include "benchmark_string_replace.hpp"
#include "binary_cast_benchmark.hpp"
#include "core/block/block.h"
#include "core/column/column_string.h"
Expand Down
269 changes: 269 additions & 0 deletions be/benchmark/benchmark_string_replace.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,269 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// ============================================================
// Benchmark: String Replace — old (per-row std::string) vs new (columnar)
//
// Measures the performance of string replace when the needle and the
// replacement are both constants. The "new" path uses a two-level
// search strategy: memchr (SIMD-accelerated in glibc; the exact ISA is
// chosen at run time for the host CPU) as a per-row first-byte
// pre-filter for fast no-match short-circuiting, then a prebuilt
// ASCIICaseSensitiveStringSearcher (SSE4.1) for full needle matching.
// It writes output directly to ColumnString chars/offsets, mirroring
// FunctionReplace::_replace_const_pattern() except that the benchmark
// copy omits the ColumnString::check_chars_length() overflow checks.
// ============================================================

#include <benchmark/benchmark.h>

#include <cstdint>
#include <cstring>
#include <random>
#include <string>
#include <string_view>
#include <vector>

#include "core/column/column_string.h"
#include "exec/common/string_searcher.h"

namespace doris {

// ---- Old implementation (current Doris: per-row std::string find/replace) ----
// Returns a copy of `str` in which every non-overlapping occurrence of
// `old_str` (scanned left to right) has been replaced by `new_str`.
// Text inserted by a replacement is never rescanned, so a replacement that
// itself contains the pattern cannot cause an endless loop.
// An empty `old_str` leaves the input untouched.
static std::string replace_old(std::string str, std::string_view old_str,
                               std::string_view new_str) {
    if (!old_str.empty()) {
        // After each substitution, resume the search just past the inserted text.
        for (auto hit = str.find(old_str); hit != std::string::npos;
             hit = str.find(old_str, hit + new_str.size())) {
            str.replace(hit, old_str.size(), new_str);
        }
    }
    return str;
}

static void replace_old_column(const ColumnString& src, const std::string& needle,
const std::string& replacement, ColumnString& dst) {
size_t rows = src.size();
for (size_t i = 0; i < rows; ++i) {
StringRef ref = src.get_data_at(i);
std::string result = replace_old(ref.to_string(), needle, replacement);
dst.insert_data(result.data(), result.size());
}
}

// ---- New implementation (columnar: memchr pre-filter + SSE4.1 searcher) ----
// Matches the fast path in FunctionReplace::_replace_const_pattern().
// Two-level search strategy:
// 1. memchr (glibc AVX512) for needle's first byte per row.
// If absent -> guaranteed no match -> single bulk memcpy, no SSE4.1 overhead.
// 2. ASCIICaseSensitiveStringSearcher (SSE4.1, built once) for full needle scan
// only on rows where the first byte was found.
// Writes output directly to ColumnString chars/offsets — no per-row std::string.
static void replace_new_column(const ColumnString& src, const std::string& needle,
const std::string& replacement, ColumnString& dst) {
auto& dst_chars = dst.get_chars();
auto& dst_offsets = dst.get_offsets();
size_t rows = src.size();

dst_chars.reserve(src.get_chars().size());
dst_offsets.resize(rows);

if (needle.empty()) {
dst_chars.insert(src.get_chars().begin(), src.get_chars().end());
memcpy(dst_offsets.data(), src.get_offsets().data(), rows * sizeof(dst_offsets[0]));
return;
}

// Build SSE4.1 searcher once — first+second byte masks precomputed here.
ASCIICaseSensitiveStringSearcher searcher(needle.data(), needle.size());
const size_t needle_size = needle.size();
const size_t replacement_size = replacement.size();
const char* replacement_data = replacement.data();
const auto needle_first = static_cast<unsigned char>(needle[0]);

for (size_t i = 0; i < rows; ++i) {
StringRef row = src.get_data_at(i);
const char* const row_end = row.data + row.size;

// Level-1: memchr for needle's first byte (glibc AVX512).
// First byte absent -> no match possible -> bulk-copy entire row.
if (memchr(row.data, needle_first, row.size) == nullptr) {
size_t old_size = dst_chars.size();
dst_chars.resize(old_size + row.size);
memcpy(&dst_chars[old_size], row.data, row.size);
dst_offsets[i] = static_cast<ColumnString::Offset>(dst_chars.size());
continue;
}

// Level-2: SSE4.1 searcher for full needle matching.
const char* pos = row.data;
while (pos < row_end) {
const char* match = searcher.search(pos, row_end);
size_t prefix_len = static_cast<size_t>(match - pos);
if (prefix_len > 0) {
size_t old_size = dst_chars.size();
dst_chars.resize(old_size + prefix_len);
memcpy(&dst_chars[old_size], pos, prefix_len);
}
if (match == row_end) {
break;
}
if (replacement_size > 0) {
size_t old_size = dst_chars.size();
dst_chars.resize(old_size + replacement_size);
memcpy(&dst_chars[old_size], replacement_data, replacement_size);
}
pos = match + needle_size;
}
dst_offsets[i] = static_cast<ColumnString::Offset>(dst_chars.size());
}
}

// ---- Helper: build a ColumnString of pseudo-random lowercase rows ----
// Produces `num_rows` rows. Each row's length is drawn uniformly from
// [avg_len / 2, avg_len * 3 / 2]. While filling a row, every step either plants
// a full copy of `needle` (with probability `hit_rate`, and only if it still
// fits in the remaining length) or appends one random letter in 'a'..'z'.
// The generator is seeded with `seed`, so the same arguments always yield the
// same column on a given standard library.
static ColumnString::MutablePtr build_test_column(size_t num_rows, size_t avg_len,
                                                  const std::string& needle, double hit_rate,
                                                  unsigned seed = 42) {
    std::mt19937 gen(seed);
    std::uniform_int_distribution<size_t> pick_length(avg_len / 2, avg_len * 3 / 2);
    std::uniform_real_distribution<double> roll(0.0, 1.0);
    std::uniform_int_distribution<int> pick_letter('a', 'z');

    auto column = ColumnString::create();
    for (size_t row = 0; row != num_rows; ++row) {
        const size_t target_len = pick_length(gen);
        std::string text;
        text.reserve(target_len + needle.size() * 3);

        while (text.size() < target_len) {
            // The probability roll is skipped entirely for an empty needle, and is
            // consumed before the fit check, so the draw sequence stays stable.
            const bool plant_needle = !needle.empty() && roll(gen) < hit_rate &&
                                      text.size() + needle.size() <= target_len;
            if (plant_needle) {
                text += needle;
            } else {
                text += static_cast<char>(pick_letter(gen));
            }
        }
        column->insert_data(text.data(), text.size());
    }
    return column;
}

// -------- Benchmarks --------

// Small strings, high hit rate, many rows
static void BM_Replace_Old_SmallStr(benchmark::State& state) {
std::string needle = "abc";
std::string replacement = "XY";
auto src = build_test_column(10000, 50, needle, 0.1);
for (auto _ : state) {
auto dst = ColumnString::create();
replace_old_column(*src, needle, replacement, *dst);
benchmark::DoNotOptimize(dst);
}
}

static void BM_Replace_New_SmallStr(benchmark::State& state) {
std::string needle = "abc";
std::string replacement = "XY";
auto src = build_test_column(10000, 50, needle, 0.1);
for (auto _ : state) {
auto dst = ColumnString::create();
replace_new_column(*src, needle, replacement, *dst);
benchmark::DoNotOptimize(dst);
}
}

// Medium strings, moderate hit rate
static void BM_Replace_Old_MedStr(benchmark::State& state) {
std::string needle = "hello";
std::string replacement = "world!";
auto src = build_test_column(5000, 200, needle, 0.05);
for (auto _ : state) {
auto dst = ColumnString::create();
replace_old_column(*src, needle, replacement, *dst);
benchmark::DoNotOptimize(dst);
}
}

static void BM_Replace_New_MedStr(benchmark::State& state) {
std::string needle = "hello";
std::string replacement = "world!";
auto src = build_test_column(5000, 200, needle, 0.05);
for (auto _ : state) {
auto dst = ColumnString::create();
replace_new_column(*src, needle, replacement, *dst);
benchmark::DoNotOptimize(dst);
}
}

// Large strings, low hit rate
static void BM_Replace_Old_LargeStr(benchmark::State& state) {
std::string needle = "pattern";
std::string replacement = "REPLACED";
auto src = build_test_column(1000, 1000, needle, 0.02);
for (auto _ : state) {
auto dst = ColumnString::create();
replace_old_column(*src, needle, replacement, *dst);
benchmark::DoNotOptimize(dst);
}
}

static void BM_Replace_New_LargeStr(benchmark::State& state) {
std::string needle = "pattern";
std::string replacement = "REPLACED";
auto src = build_test_column(1000, 1000, needle, 0.02);
for (auto _ : state) {
auto dst = ColumnString::create();
replace_new_column(*src, needle, replacement, *dst);
benchmark::DoNotOptimize(dst);
}
}

// No matches (needle not present) — measures search overhead
static void BM_Replace_Old_NoMatch(benchmark::State& state) {
std::string needle = "ZZZZZZ";
std::string replacement = "X";
auto src = build_test_column(10000, 100, "abc", 0.0); // no ZZZZZZ in data
for (auto _ : state) {
auto dst = ColumnString::create();
replace_old_column(*src, needle, replacement, *dst);
benchmark::DoNotOptimize(dst);
}
}

static void BM_Replace_New_NoMatch(benchmark::State& state) {
std::string needle = "ZZZZZZ";
std::string replacement = "X";
auto src = build_test_column(10000, 100, "abc", 0.0);
for (auto _ : state) {
auto dst = ColumnString::create();
replace_new_column(*src, needle, replacement, *dst);
benchmark::DoNotOptimize(dst);
}
}

BENCHMARK(BM_Replace_Old_SmallStr);
BENCHMARK(BM_Replace_New_SmallStr);
BENCHMARK(BM_Replace_Old_MedStr);
BENCHMARK(BM_Replace_New_MedStr);
BENCHMARK(BM_Replace_Old_LargeStr);
BENCHMARK(BM_Replace_New_LargeStr);
BENCHMARK(BM_Replace_Old_NoMatch);
BENCHMARK(BM_Replace_New_NoMatch);

} // namespace doris
82 changes: 82 additions & 0 deletions be/src/exprs/function/function_string_replace.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#include "core/data_type/data_type_number.h"
#include "core/data_type/data_type_string.h"
#include "core/string_ref.h"
#include "exec/common/string_searcher.h"
#include "exec/common/stringop_substring.h"
#include "exec/common/template_helpers.hpp"
#include "exprs/function/function.h"
Expand Down Expand Up @@ -87,6 +88,23 @@ class FunctionReplace : public IFunction {

ColumnString::MutablePtr col_res = ColumnString::create();

// Fast path: when old_str and new_str are both constant and old_str is
// non-empty (the common case for replace(col, 'literal', 'literal')).
// Works directly on ColumnString chars/offsets to avoid per-row
// std::string allocation and copy overhead.
// Applies to both replace (empty=true) and replace_empty (empty=false):
// when old_str is non-empty the two variants behave identically.
if (col_const[1] && col_const[2]) {
StringRef old_ref = col_old_str->get_data_at(0);
StringRef new_ref = col_new_str->get_data_at(0);
if (old_ref.size > 0) {
_replace_const_pattern(*col_origin_str, old_ref, new_ref, *col_res,
input_rows_count, col_const[0]);
block.replace_by_position(result, std::move(col_res));
return Status::OK();
}
}

std::visit(
[&](auto origin_str_const, auto old_str_const, auto new_str_const) {
for (int i = 0; i < input_rows_count; ++i) {
Expand All @@ -112,6 +130,70 @@ class FunctionReplace : public IFunction {
}

private:
// Optimized replace path for constant old_str (non-empty) and constant new_str.
// Avoids per-row std::string allocation by working directly on ColumnString
// chars/offsets. Two-level search strategy:
// 1. memchr (glibc AVX512) scans for the needle's first byte. If absent,
// the row is guaranteed no-match and is bulk-copied with a single memcpy.
// 2. When the first byte is present, ASCIICaseSensitiveStringSearcher
// (SSE4.1, prebuilt once outside the row loop) does the full needle scan.
//
// Parameters:
// src              - source string column.
// old_ref          - needle; must be non-empty because its first byte is read
//                    unconditionally below.
// new_ref          - replacement text; may be empty (occurrences are then removed).
// dst              - output column; its offsets are resized to input_rows_count and
//                    written by row index, so it is expected to be empty on entry.
// input_rows_count - number of output rows to produce.
// src_const        - when true, every output row is computed from row 0 of src.
static void _replace_const_pattern(const ColumnString& src, StringRef old_ref,
StringRef new_ref, ColumnString& dst,
size_t input_rows_count, bool src_const) {
auto& dst_chars = dst.get_chars();
auto& dst_offsets = dst.get_offsets();

// Capacity hint only: the total number of input bytes. The output can be longer
// when the replacement is longer than the needle; the resize() calls below grow
// the buffer as needed.
dst_chars.reserve(src_const ? (src.get_data_at(0).size * input_rows_count)
: src.get_chars().size());
dst_offsets.resize(input_rows_count);

// Build SSE4.1 searcher once — first+second byte masks precomputed here.
ASCIICaseSensitiveStringSearcher searcher(old_ref.data, old_ref.size);
const size_t needle_size = old_ref.size;
const size_t replacement_size = new_ref.size;
const char* replacement_data = new_ref.data;
// Reads old_ref.data[0] without a size check: callers must pass a non-empty needle.
const auto needle_first = static_cast<unsigned char>(old_ref.data[0]);

for (size_t i = 0; i < input_rows_count; ++i) {
// With src_const set, row 0 is reused as the input for every output row.
StringRef row = src.get_data_at(src_const ? 0 : i);
const char* const row_end = row.data + row.size;

// Level-1: memchr for needle's first byte (glibc uses AVX512 internally).
// If the first byte is absent the entire row cannot contain the needle;
// bulk-copy it and move to the next row without entering the SSE4.1 loop.
if (memchr(row.data, needle_first, row.size) == nullptr) {
StringOP::push_value_string({row.data, row.size}, i, dst_chars, dst_offsets);
continue;
}

// Level-2: SSE4.1 searcher handles needle matching for this row.
const char* pos = row.data;
while (pos < row_end) {
// search() is treated as returning row_end when no further occurrence exists.
const char* match = searcher.search(pos, row_end);
// Copy prefix before match
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This fast path bypasses ColumnString::insert_data() / StringOP::push_value_string() and never calls ColumnString::check_chars_length() before growing dst_chars. The old implementation would raise STRING_OVERFLOW_IN_VEC_ENGINE once the result column exceeded 4 GiB, but this path will resize() past that and then truncate offsets when push_empty_string() stores chars.size() into the 32-bit offsets array. A query that expands strings significantly can therefore silently corrupt results instead of failing. Please preserve the overflow check before each manual resize() (or validate the final row growth up front) so the fast path matches the existing ColumnString contract.

size_t prefix_len = static_cast<size_t>(match - pos);
if (prefix_len > 0) {
size_t old_size = dst_chars.size();
// NOTE(review): presumably rejects a chars buffer too large for the offsets
// type (see the review discussion above) — confirm against ColumnString.
ColumnString::check_chars_length(old_size + prefix_len, i + 1);
dst_chars.resize(old_size + prefix_len);
memcpy(&dst_chars[old_size], pos, prefix_len);
}
// No further occurrence in this row: the remaining tail was copied above.
if (match == row_end) {
break;
}
// Copy replacement
if (replacement_size > 0) {
size_t old_size = dst_chars.size();
// Same size check as for the prefix copy, applied before growing the buffer.
ColumnString::check_chars_length(old_size + replacement_size, i + 1);
dst_chars.resize(old_size + replacement_size);
memcpy(&dst_chars[old_size], replacement_data, replacement_size);
}
// Resume scanning immediately after the matched needle.
pos = match + needle_size;
}
// NOTE(review): presumably appends no bytes and records the current
// dst_chars.size() as the end offset of row i — confirm against StringOP.
StringOP::push_empty_string(i, dst_chars, dst_offsets);
}
}

std::string replace(std::string str, std::string_view old_str, std::string_view new_str) const {
if (old_str.empty()) {
if constexpr (empty) {
Expand Down
Loading