diff --git a/be/benchmark/benchmark_main.cpp b/be/benchmark/benchmark_main.cpp index 21e629e3d6f92b..f137dc3cad3a50 100644 --- a/be/benchmark/benchmark_main.cpp +++ b/be/benchmark/benchmark_main.cpp @@ -25,6 +25,7 @@ #include "benchmark_hll_merge.hpp" #include "benchmark_hybrid_set.hpp" #include "benchmark_string.hpp" +#include "benchmark_zone_map_index.hpp" #include "binary_cast_benchmark.hpp" #include "core/block/block.h" #include "core/column/column_string.h" diff --git a/be/benchmark/benchmark_zone_map_index.hpp b/be/benchmark/benchmark_zone_map_index.hpp new file mode 100644 index 00000000000000..2fe0d41733b9a2 --- /dev/null +++ b/be/benchmark/benchmark_zone_map_index.hpp @@ -0,0 +1,257 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// ============================================================ +// Benchmark: TypedZoneMapIndexWriter::add_values +// +// Measures CPU cost of feeding values into the per-page zone-map +// builder for a few representative primitive types and call sizes. +// ============================================================ + +#pragma once + +#include + +#include +#include +#include +#include +#include + +#include "core/data_type/data_type_factory.hpp" +#include "core/string_ref.h" +#include "storage/field.h" +#include "storage/index/zone_map/zone_map_index.h" +#include "storage/tablet/tablet_schema.h" +#include "util/slice.h" + +namespace doris::segment_v2 { + +namespace bench_zone_map { + +constexpr size_t kTotalRows = 1 << 16; // 65536 rows fed per iteration +constexpr size_t kStoragePageSize = 65536; // STORAGE_PAGE_SIZE_DEFAULT_VALUE + +inline std::vector gen_int32(size_t n) { + std::mt19937 rng(0xC0FFEE); + std::uniform_int_distribution d(-1'000'000, 1'000'000); + std::vector v(n); + for (auto& x : v) x = d(rng); + return v; +} +inline std::vector gen_int64(size_t n) { + std::mt19937_64 rng(0xC0FFEE); + std::uniform_int_distribution d(-1'000'000'000LL, 1'000'000'000LL); + std::vector v(n); + for (auto& x : v) x = d(rng); + return v; +} +inline std::vector gen_double(size_t n) { + std::mt19937 rng(0xC0FFEE); + std::uniform_real_distribution d(-1e6, 1e6); + std::vector v(n); + for (auto& x : v) x = d(rng); + return v; +} +// Build a contiguous string buffer + Slice array (ValType for string is StringRef/Slice). +struct StringBatch { + std::vector data; + std::vector slices; +}; +inline StringBatch gen_strings(size_t n, size_t avg_len = 16) { + StringBatch b; + b.data.reserve(n); + b.slices.reserve(n); + std::mt19937 rng(0xC0FFEE); + std::uniform_int_distribution ch('a', 'z'); + for (size_t i = 0; i < n; ++i) { + std::string s(avg_len, 'a'); + for (auto& c : s) c = static_cast(ch(rng)); + b.data.emplace_back(std::move(s)); + } + for (auto& s : b.data) b.slices.emplace_back(s.data(), s.size()); + return b; +} + +inline TabletColumnPtr make_column(FieldType ft, int32_t length, int32_t index_length) { + auto c = std::make_shared(); + c->_unique_id = 0; + c->_col_name = "c"; + c->_type = ft; + c->_is_key = true; + c->_is_nullable = false; + c->_length = length; + c->_index_length = index_length; + return c; +} + +template +std::unique_ptr make_writer() { + TabletColumnPtr col; + DataTypePtr dtype; + if constexpr (PType == TYPE_INT) { + col = make_column(FieldType::OLAP_FIELD_TYPE_INT, 4, 4); + dtype = DataTypeFactory::instance().create_data_type(TYPE_INT, false); + } else if constexpr (PType == TYPE_BIGINT) { + col = make_column(FieldType::OLAP_FIELD_TYPE_BIGINT, 8, 8); + dtype = DataTypeFactory::instance().create_data_type(TYPE_BIGINT, false); + } else if constexpr (PType == TYPE_DOUBLE) { + col = make_column(FieldType::OLAP_FIELD_TYPE_DOUBLE, 8, 8); + dtype = DataTypeFactory::instance().create_data_type(TYPE_DOUBLE, false); + } else if constexpr (PType == TYPE_VARCHAR) { + col = make_column(FieldType::OLAP_FIELD_TYPE_VARCHAR, 64, 1); + dtype = DataTypeFactory::instance().create_data_type(TYPE_VARCHAR, false, 0, 0, 64); + } + std::unique_ptr field(StorageFieldFactory::create(*col)); + std::unique_ptr w; + (void)ZoneMapIndexWriter::create(dtype, field.get(), w); + return w; +} + +template +void run(benchmark::State& state, const Vec& values) { + const size_t batch = static_cast(state.range(0)); + const size_t total = values.size(); + for (auto _ : state) { + auto w = make_writer(); + size_t off = 0; + while (off < total) { + size_t n = std::min(batch, total - off); + w->add_values(reinterpret_cast(&values[off]), n); + off += n; + } + (void)w->flush(); + benchmark::DoNotOptimize(w); + } + state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(total)); +} + +// Simulates the ScalarColumnWriter call pattern in compaction: +// - merge iterator hands `block_rows`-row blocks to ColumnWriter::append +// - column_writer chunks each block by page remaining capacity and calls +// add_values() per chunk +// - when a page is full, finish_current_page() calls flush() on the zone +// map builder, then a new page begins +// +// Compaction batch_size is computed dynamically as +// `block_mem_limit / group_data_size` clamped to [32, 4064] +// (be/src/storage/merger.cpp:458). For wide rows / variant data it routinely +// drops to the low end (32 - 256), which is the case the flame graph exposes. +template +void run_column_writer_like(benchmark::State& state, const Vec& values, size_t elem_size) { + const size_t block_rows = static_cast(state.range(0)); + const size_t page_capacity = kStoragePageSize / elem_size; // e.g. 16384 for int32 + const size_t total = values.size(); + for (auto _ : state) { + auto w = make_writer(); + size_t off = 0; + size_t page_used = 0; + while (off < total) { + size_t block_left = std::min(block_rows, total - off); + while (block_left > 0) { + size_t n = std::min(block_left, page_capacity - page_used); + w->add_values(reinterpret_cast(&values[off]), n); + off += n; + block_left -= n; + page_used += n; + if (page_used == page_capacity) { + (void)w->flush(); + page_used = 0; + } + } + } + if (page_used) (void)w->flush(); + benchmark::DoNotOptimize(w); + } + state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(total)); +} + +static void BM_ZoneMap_Int32(benchmark::State& state) { + static auto vals = gen_int32(kTotalRows); + run(state, vals); +} +static void BM_ZoneMap_Int64(benchmark::State& state) { + static auto vals = gen_int64(kTotalRows); + run(state, vals); +} +static void BM_ZoneMap_Double(benchmark::State& state) { + static auto vals = gen_double(kTotalRows); + run(state, vals); +} +static void BM_ZoneMap_String(benchmark::State& state) { + static auto batch = gen_strings(kTotalRows, 16); + run(state, batch.slices); +} + +BENCHMARK(BM_ZoneMap_Int32)->Arg(1)->Arg(64)->Arg(1024); +BENCHMARK(BM_ZoneMap_Int64)->Arg(1)->Arg(64)->Arg(1024); +BENCHMARK(BM_ZoneMap_Double)->Arg(1)->Arg(64)->Arg(1024); +BENCHMARK(BM_ZoneMap_String)->Arg(1)->Arg(64)->Arg(1024); + +// Realistic compaction-shaped: 1024-row blocks + page-driven flush(). +static void BM_ZoneMap_ColWriter_Int32(benchmark::State& state) { + static auto vals = gen_int32(kTotalRows); + run_column_writer_like(state, vals, sizeof(int32_t)); +} +static void BM_ZoneMap_ColWriter_Int64(benchmark::State& state) { + static auto vals = gen_int64(kTotalRows); + run_column_writer_like(state, vals, sizeof(int64_t)); +} +static void BM_ZoneMap_ColWriter_Double(benchmark::State& state) { + static auto vals = gen_double(kTotalRows); + run_column_writer_like(state, vals, sizeof(double)); +} +static void BM_ZoneMap_ColWriter_String(benchmark::State& state) { + static auto batch = gen_strings(kTotalRows, 16); + // For strings the page packs (size+payload); use ~32B avg per element. + run_column_writer_like(state, batch.slices, 32); +} +BENCHMARK(BM_ZoneMap_ColWriter_Int32) + ->Arg(1) + ->Arg(4) + ->Arg(16) + ->Arg(64) + ->Arg(256) + ->Arg(1024) + ->Arg(4096); +BENCHMARK(BM_ZoneMap_ColWriter_Int64) + ->Arg(1) + ->Arg(4) + ->Arg(16) + ->Arg(64) + ->Arg(256) + ->Arg(1024) + ->Arg(4096); +BENCHMARK(BM_ZoneMap_ColWriter_Double) + ->Arg(1) + ->Arg(4) + ->Arg(16) + ->Arg(64) + ->Arg(256) + ->Arg(1024) + ->Arg(4096); +BENCHMARK(BM_ZoneMap_ColWriter_String) + ->Arg(1) + ->Arg(4) + ->Arg(16) + ->Arg(64) + ->Arg(256) + ->Arg(1024) + ->Arg(4096); + +} // namespace bench_zone_map +} // namespace doris::segment_v2 diff --git a/be/src/storage/index/zone_map/zone_map_index.cpp b/be/src/storage/index/zone_map/zone_map_index.cpp index a503dd746431d5..b857e04da047f9 100644 --- a/be/src/storage/index/zone_map/zone_map_index.cpp +++ b/be/src/storage/index/zone_map/zone_map_index.cpp @@ -122,17 +122,47 @@ TypedZoneMapIndexWriter::TypedZoneMapIndexWriter(DataTypePtr&& data_type) template void TypedZoneMapIndexWriter::_update_page_zonemap(const ValType& min_value, const ValType& max_value) { - auto min_field = doris::Field::create_field_from_olap_value(min_value); - auto max_field = doris::Field::create_field_from_olap_value(max_value); - if (!_page_zone_map.has_not_null || min_field < _page_zone_map.min_value) { - _page_zone_map.min_value = std::move(min_field); - } - if (!_page_zone_map.has_not_null || max_field > _page_zone_map.max_value) { - _page_zone_map.max_value = std::move(max_field); + // Hot path: compare/store using raw CppType to avoid Field temporaries. + // For string types, truncate to MAX_ZONE_MAP_INDEX_SIZE (matching the old + // Field-based path) and copy bytes into the arena so the StringRef stays + // valid across add_values() calls. + if constexpr (is_string_type(Type)) { + constexpr size_t kMaxZoneMapSize = MAX_ZONE_MAP_INDEX_SIZE; + auto truncate_and_store = [&](const StringRef& src) { + auto sz = std::min(src.size, kMaxZoneMapSize); + char* buf = _arena.alloc(sz); + memcpy(buf, src.data, sz); + return StringRef(buf, sz); + }; + StringRef min_t(min_value.data, std::min(min_value.size, kMaxZoneMapSize)); + StringRef max_t(max_value.data, std::min(max_value.size, kMaxZoneMapSize)); + if (!_page_has_minmax || min_t < _page_min) { + _page_min = truncate_and_store(min_value); + } + if (!_page_has_minmax || _page_max < max_t) { + _page_max = truncate_and_store(max_value); + } + } else { + if (!_page_has_minmax || min_value < _page_min) { + _page_min = min_value; + } + if (!_page_has_minmax || max_value > _page_max) { + _page_max = max_value; + } } + _page_has_minmax = true; _page_zone_map.has_not_null = true; } +template +void TypedZoneMapIndexWriter::_materialize_page_minmax() { + if (!_page_has_minmax) { + return; + } + _page_zone_map.min_value = doris::Field::create_field_from_olap_value(_page_min); + _page_zone_map.max_value = doris::Field::create_field_from_olap_value(_page_max); +} + template void TypedZoneMapIndexWriter::add_values(const void* values, size_t count) { if (count == 0) { @@ -195,14 +225,20 @@ void TypedZoneMapIndexWriter::invalid_page_zone_map() { template Status TypedZoneMapIndexWriter::flush() { + // Materialize the running CppType min/max into the Field-typed page zone map + // before merging into the segment zone map / serializing to proto. + _materialize_page_minmax(); + // Update segment zone map. - if (!_segment_zone_map.has_not_null || - _segment_zone_map.min_value.get() > _page_zone_map.min_value.get()) { - _segment_zone_map.min_value = _page_zone_map.min_value; - } - if (!_segment_zone_map.has_not_null || - _segment_zone_map.max_value.get() < _page_zone_map.max_value.get()) { - _segment_zone_map.max_value = _page_zone_map.max_value; + if (_page_has_minmax) { + if (!_segment_zone_map.has_not_null || + _segment_zone_map.min_value.get() > _page_zone_map.min_value.get()) { + _segment_zone_map.min_value = _page_zone_map.min_value; + } + if (!_segment_zone_map.has_not_null || + _segment_zone_map.max_value.get() < _page_zone_map.max_value.get()) { + _segment_zone_map.max_value = _page_zone_map.max_value; + } } if (_page_zone_map.has_null) { _segment_zone_map.has_null = true; @@ -224,6 +260,10 @@ Status TypedZoneMapIndexWriter::flush() { modify_index_before_flush(_page_zone_map); _page_zone_map.to_proto(&zone_map_pb, _data_type); _reset_zone_map(&_page_zone_map); + _page_has_minmax = false; + _page_min = ValType(); + _page_max = ValType(); + _arena.clear(); std::string serialized_zone_map; bool ret = zone_map_pb.SerializeToString(&serialized_zone_map); diff --git a/be/src/storage/index/zone_map/zone_map_index.h b/be/src/storage/index/zone_map/zone_map_index.h index e10683acb9cc33..448898abf0ee3a 100644 --- a/be/src/storage/index/zone_map/zone_map_index.h +++ b/be/src/storage/index/zone_map/zone_map_index.h @@ -30,6 +30,7 @@ #include "core/arena.h" #include "core/data_type/data_type.h" #include "core/data_type/define_primitive_type.h" +#include "core/string_ref.h" #include "io/fs/file_reader_writer_fwd.h" #include "storage/field.h" #include "storage/metadata_adder.h" @@ -148,10 +149,20 @@ class TypedZoneMapIndexWriter final : public ZoneMapIndexWriter { void _update_page_zonemap(const ValType& min_value, const ValType& max_value); + // Materialize the running CppType min/max into _page_zone_map.{min,max}_value. + // Called at flush() time, so the per-row hot path never constructs a Field. + void _materialize_page_minmax(); + DataTypePtr _data_type; // memory will be managed by Arena ZoneMap _page_zone_map; ZoneMap _segment_zone_map; + // Running min/max for the current page kept as raw ValType (avoids per-row + // Field temporaries). For string types, the bytes are copied into _arena so + // the StringRef stays valid across add_values() calls. + ValType _page_min {}; + ValType _page_max {}; + bool _page_has_minmax = false; // TODO(zc): we should replace this arena later, we only allocate min/max // for field. But Arena allocate 4KB least, it will a waste for most cases. Arena _arena;