diff --git a/CMakeLists.txt b/CMakeLists.txt index 124d6c540..0e0548c52 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,6 +45,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) option(PAIMON_BUILD_STATIC "Build static library" ON) option(PAIMON_BUILD_SHARED "Build shared library" ON) option(PAIMON_BUILD_TESTS "Build tests" OFF) +option(PAIMON_BUILD_BENCHMARKS "Build benchmarks" OFF) option(PAIMON_USE_ASAN "Use Address Sanitizer" OFF) option(PAIMON_USE_UBSAN "Use Undefined Behavior Sanitizer" OFF) option(PAIMON_USE_CXX11_ABI "Use C++11 ABI" ON) @@ -354,6 +355,29 @@ endif() set(ENV{PAIMON_TEST_DATA} "${CMAKE_SOURCE_DIR}/test/test_data") +if(PAIMON_BUILD_TESTS OR PAIMON_BUILD_BENCHMARKS) + resolve_dependency(GTest) + include_directories(SYSTEM ${GTEST_INCLUDE_DIR}) + + paimon_link_libraries_whole_archive(PAIMON_LOCAL_FILE_SYSTEM_STATIC_LINK_LIBS + paimon_local_file_system_static) + paimon_link_libraries_no_as_needed(PAIMON_LOCAL_FILE_SYSTEM_SHARED_LINK_LIBS + paimon_local_file_system_shared) + paimon_link_libraries_whole_archive(PAIMON_BLOB_FILE_FORMAT_STATIC_LINK_LIBS + paimon_blob_file_format_static) + paimon_link_libraries_whole_archive(PAIMON_PARQUET_FILE_FORMAT_STATIC_LINK_LIBS + paimon_parquet_file_format_static) + + if(PAIMON_ENABLE_ORC) + paimon_link_libraries_whole_archive(PAIMON_ORC_FILE_FORMAT_STATIC_LINK_LIBS + paimon_orc_file_format_static) + endif() + if(PAIMON_ENABLE_AVRO) + paimon_link_libraries_whole_archive(PAIMON_AVRO_FILE_FORMAT_STATIC_LINK_LIBS + paimon_avro_file_format_static) + endif() +endif() + if(PAIMON_BUILD_TESTS) if(NOT PAIMON_ENABLE_ORC) message(FATAL_ERROR "PAIMON_ENABLE_ORC must be enabled if PAIMON_BUILD_TESTS is enable" @@ -365,7 +389,6 @@ if(PAIMON_BUILD_TESTS) endif() # Adding unit tests part of the "paimon" portion of the test suite add_custom_target(paimon-tests) - resolve_dependency(GTest) add_custom_target(unittest ctest @@ -375,7 +398,6 @@ if(PAIMON_BUILD_TESTS) --output-on-failure) add_dependencies(unittest paimon-tests) - 
include_directories(SYSTEM ${GTEST_INCLUDE_DIR}) include_directories("${CMAKE_SOURCE_DIR}/test/") paimon_link_libraries_whole_archive( @@ -388,15 +410,6 @@ if(PAIMON_BUILD_TESTS) TEST_PLUGIN_LINK_LIBS paimon_parquet_file_format_shared paimon_blob_file_format_shared) set(TEST_STATIC_LINK_LIBS ${TEST_WHOLE_ARCHIVE_LINK_LIBS} ${TEST_PLUGIN_LINK_LIBS}) - paimon_link_libraries_whole_archive(PAIMON_LOCAL_FILE_SYSTEM_STATIC_LINK_LIBS - paimon_local_file_system_static) - paimon_link_libraries_no_as_needed(PAIMON_LOCAL_FILE_SYSTEM_SHARED_LINK_LIBS - paimon_local_file_system_shared) - paimon_link_libraries_whole_archive(PAIMON_BLOB_FILE_FORMAT_STATIC_LINK_LIBS - paimon_blob_file_format_static) - paimon_link_libraries_whole_archive(PAIMON_PARQUET_FILE_FORMAT_STATIC_LINK_LIBS - paimon_parquet_file_format_static) - if(PAIMON_ENABLE_LANCE) paimon_link_libraries_whole_archive(PAIMON_LANCE_FILE_FORMAT_STATIC_LINK_LIBS paimon_lance_file_format_static) @@ -405,15 +418,11 @@ if(PAIMON_BUILD_TESTS) list(APPEND TEST_STATIC_LINK_LIBS ${TEST_PLUGIN_LINK_LIBS}) endif() if(PAIMON_ENABLE_ORC) - paimon_link_libraries_whole_archive(PAIMON_ORC_FILE_FORMAT_STATIC_LINK_LIBS - paimon_orc_file_format_static) paimon_link_libraries_no_as_needed(TEST_PLUGIN_LINK_LIBS paimon_orc_file_format_shared) list(APPEND TEST_STATIC_LINK_LIBS ${TEST_PLUGIN_LINK_LIBS}) endif() if(PAIMON_ENABLE_AVRO) - paimon_link_libraries_whole_archive(PAIMON_AVRO_FILE_FORMAT_STATIC_LINK_LIBS - paimon_avro_file_format_static) paimon_link_libraries_no_as_needed(TEST_PLUGIN_LINK_LIBS paimon_avro_file_format_shared) list(APPEND TEST_STATIC_LINK_LIBS ${TEST_PLUGIN_LINK_LIBS}) @@ -441,6 +450,19 @@ if(PAIMON_BUILD_TESTS) endif() endif() +if(PAIMON_BUILD_BENCHMARKS) + add_custom_target(paimon-benchmarks) + add_custom_target(benchmark + ctest + -j4 + -L + benchmark + --output-on-failure) + add_dependencies(benchmark paimon-benchmarks) + + set(PAIMON_BENCHMARK_LINK_TOOLCHAIN benchmark::benchmark) +endif() + 
paimon_print_dependency_resolution_summary() include(CMakePackageConfigHelpers) @@ -472,3 +494,4 @@ add_subdirectory(src/paimon/global_index/lucene) add_subdirectory(src/paimon/testing/mock) add_subdirectory(src/paimon/testing/utils) add_subdirectory(test/inte) +add_subdirectory(benchmark) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt new file mode 100644 index 000000000..c7c58ac37 --- /dev/null +++ b/benchmark/CMakeLists.txt @@ -0,0 +1,75 @@ +# Copyright 2026-present Alibaba Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +if(NOT PAIMON_BUILD_BENCHMARKS AND NOT PAIMON_BUILD_TESTS) + return() +endif() + +find_package(Threads REQUIRED) + +set(PAIMON_BENCHMARK_STATIC_LINK_LIBS + paimon_shared ${PAIMON_LOCAL_FILE_SYSTEM_SHARED_LINK_LIBS} + ${PAIMON_PARQUET_FILE_FORMAT_STATIC_LINK_LIBS} + ${PAIMON_BLOB_FILE_FORMAT_STATIC_LINK_LIBS}) + +if(PAIMON_ENABLE_ORC) + list(APPEND PAIMON_BENCHMARK_STATIC_LINK_LIBS + ${PAIMON_ORC_FILE_FORMAT_STATIC_LINK_LIBS}) +endif() + +if(PAIMON_ENABLE_AVRO) + list(APPEND PAIMON_BENCHMARK_STATIC_LINK_LIBS + ${PAIMON_AVRO_FILE_FORMAT_STATIC_LINK_LIBS}) +endif() + +set(PAIMON_BENCHMARK_PLATFORM_LINK_LIBS) +if(UNIX AND NOT APPLE) + find_library(PAIMON_BENCHMARK_RT_LIBRARY rt) + if(PAIMON_BENCHMARK_RT_LIBRARY) + list(APPEND PAIMON_BENCHMARK_PLATFORM_LINK_LIBS ${PAIMON_BENCHMARK_RT_LIBRARY}) + endif() +endif() + +if(PAIMON_BUILD_BENCHMARKS) + add_paimon_benchmark(read_write_benchmark + SOURCES + benchmark_helpers.cpp + benchmark_suite.cpp + benchmark_case_write.cpp + benchmark_case_read.cpp + benchmark_case_pk_write.cpp + benchmark_case_mor_read.cpp + read_write_benchmark.cpp + STATIC_LINK_LIBS + arrow + parquet + ${PAIMON_BENCHMARK_STATIC_LINK_LIBS} + test_utils_static + Threads::Threads + ${CMAKE_DL_LIBS} + ${PAIMON_BENCHMARK_PLATFORM_LINK_LIBS} + ${PAIMON_BENCHMARK_LINK_TOOLCHAIN} + EXTRA_INCLUDES + ${CMAKE_SOURCE_DIR}) +endif() + +if(PAIMON_BUILD_TESTS) + add_paimon_test(cli_option_parsing_test + SOURCES + cli_option_parsing_test.cpp + EXTRA_INCLUDES + ${CMAKE_SOURCE_DIR} + STATIC_LINK_LIBS + ${GTEST_LINK_TOOLCHAIN}) +endif() diff --git a/benchmark/benchmark_case_mor_read.cpp b/benchmark/benchmark_case_mor_read.cpp new file mode 100644 index 000000000..d96c7e3e0 --- /dev/null +++ b/benchmark/benchmark_case_mor_read.cpp @@ -0,0 +1,33 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "benchmark/benchmark_suite.h" + +namespace { + +void BM_MOR_Read(::benchmark::State& state) { + paimon::benchmark::RunBMMorRead(state); +} + +} // namespace + +BENCHMARK(BM_MOR_Read) + ->ArgNames({"prefetch_parallel"}) + ->Unit(benchmark::kMillisecond) + ->UseRealTime() + ->Args({1}) + ->Args({2}) + ->Args({4}); diff --git a/benchmark/benchmark_case_pk_write.cpp b/benchmark/benchmark_case_pk_write.cpp new file mode 100644 index 000000000..d18a71d39 --- /dev/null +++ b/benchmark/benchmark_case_pk_write.cpp @@ -0,0 +1,27 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "benchmark/benchmark_suite.h" + +namespace { + +void BM_PK_Write(::benchmark::State& state) { + paimon::benchmark::RunBMPkWrite(state); +} + +} // namespace + +BENCHMARK(BM_PK_Write)->Unit(benchmark::kMillisecond)->UseRealTime(); diff --git a/benchmark/benchmark_case_read.cpp b/benchmark/benchmark_case_read.cpp new file mode 100644 index 000000000..71b528d18 --- /dev/null +++ b/benchmark/benchmark_case_read.cpp @@ -0,0 +1,33 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "benchmark/benchmark_suite.h" + +namespace { + +void BM_Read(::benchmark::State& state) { + paimon::benchmark::RunBMRead(state); +} + +} // namespace + +BENCHMARK(BM_Read) + ->ArgNames({"prefetch_parallel"}) + ->Unit(benchmark::kMillisecond) + ->UseRealTime() + ->Args({1}) + ->Args({2}) + ->Args({4}); diff --git a/benchmark/benchmark_case_write.cpp b/benchmark/benchmark_case_write.cpp new file mode 100644 index 000000000..4bf34695a --- /dev/null +++ b/benchmark/benchmark_case_write.cpp @@ -0,0 +1,27 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "benchmark/benchmark_suite.h" + +namespace { + +void BM_Write(::benchmark::State& state) { + paimon::benchmark::RunBMWrite(state); +} + +} // namespace + +BENCHMARK(BM_Write)->Unit(benchmark::kMillisecond)->UseRealTime(); diff --git a/benchmark/benchmark_helpers.cpp b/benchmark/benchmark_helpers.cpp new file mode 100644 index 000000000..f4c409d0b --- /dev/null +++ b/benchmark/benchmark_helpers.cpp @@ -0,0 +1,89 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "benchmark/benchmark_helpers.h" + +#include + +#include "benchmark/benchmark.h" + +namespace paimon::benchmark { + +bool BenchmarkHelpers::ValidateFileFormatOrSkip(::benchmark::State& state, + const std::string& file_format, bool is_supported, + SkipFn skip) { + if (!is_supported) { + skip(state, "file format is not supported in this build: " + file_format); + return false; + } + return true; +} + +bool BenchmarkHelpers::ValidateSourcePresenceOrSkip(::benchmark::State& state, + const std::string& source_path, + const std::string& message, SkipFn skip) { + if (source_path.empty()) { + skip(state, message); + return false; + } + return true; +} + +bool BenchmarkHelpers::ValidateSourceSupportOrSkip(::benchmark::State& state, + const std::string& source_format, + bool is_supported, SkipFn skip) { + if (!is_supported) { + skip(state, + "source data mode requires reader support in this build for format: " + source_format); + return false; + } + return true; +} + +bool BenchmarkHelpers::ValidatePrefetchParallelOrSkip(::benchmark::State& state, + int32_t prefetch_parallel_num, SkipFn skip) { + if (prefetch_parallel_num <= 0) { + skip(state, "prefetch_parallel must be greater than 0"); + return false; + } + return true; +} + +int64_t BenchmarkHelpers::RunReadIterations(::benchmark::State& state, + const ReadOnceFn& read_once) { + int64_t rows_read = 0; + for (auto _ : state) { + rows_read = read_once(); + } + return rows_read; +} + +bool BenchmarkHelpers::TryRunExternalReadMode(::benchmark::State& state, + const std::string& benchmark_name, + const std::string& external_table_path, + const ReadOnceFn& read_once) { + if (external_table_path.empty()) { + return false; + } + + std::cout << "[benchmark][" << benchmark_name << "] external_table_path=" << external_table_path + << std::endl; + const int64_t rows_read = RunReadIterations(state, read_once); + state.SetItemsProcessed(state.iterations() * rows_read); + return true; +} + +} // namespace 
paimon::benchmark diff --git a/benchmark/benchmark_helpers.h b/benchmark/benchmark_helpers.h new file mode 100644 index 000000000..39f6eadf9 --- /dev/null +++ b/benchmark/benchmark_helpers.h @@ -0,0 +1,55 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace benchmark { +class State; +} + +namespace paimon::benchmark { + +class BenchmarkHelpers { + public: + using ReadOnceFn = std::function; + using SkipFn = void (*)(::benchmark::State&, const std::string&); + + static bool ValidateFileFormatOrSkip(::benchmark::State& state, const std::string& file_format, + bool is_supported, SkipFn skip); + + static bool ValidateSourcePresenceOrSkip(::benchmark::State& state, + const std::string& source_path, + const std::string& message, SkipFn skip); + + static bool ValidateSourceSupportOrSkip(::benchmark::State& state, + const std::string& source_format, bool is_supported, + SkipFn skip); + + static bool ValidatePrefetchParallelOrSkip(::benchmark::State& state, + int32_t prefetch_parallel_num, SkipFn skip); + + static int64_t RunReadIterations(::benchmark::State& state, const ReadOnceFn& read_once); + + static bool TryRunExternalReadMode(::benchmark::State& state, const std::string& benchmark_name, + const std::string& external_table_path, + const ReadOnceFn& read_once); +}; + +} // namespace paimon::benchmark diff --git a/benchmark/benchmark_suite.cpp 
b/benchmark/benchmark_suite.cpp new file mode 100644 index 000000000..8c924ba3e --- /dev/null +++ b/benchmark/benchmark_suite.cpp @@ -0,0 +1,844 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "benchmark/benchmark_suite.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/api.h" +#include "arrow/c/bridge.h" +#include "arrow/io/api.h" +#include "benchmark/benchmark_helpers.h" +#include "benchmark/cli_option_parsing.h" +#include "paimon/api.h" +#include "paimon/catalog/catalog.h" +#include "paimon/testing/utils/testharness.h" + +#if __has_include("parquet/arrow/reader.h") +#include "parquet/arrow/reader.h" +#include "parquet/file_reader.h" +#define PAIMON_BENCHMARK_HAS_PARQUET_READER 1 +#else +#define PAIMON_BENCHMARK_HAS_PARQUET_READER 0 +#endif + +namespace paimon::benchmark { + +namespace { + +struct BenchmarkCliOptions { + std::string source_data_file; + std::string external_table_path; + std::string file_format = "parquet"; + int64_t source_batch_max_rows = 4096; + int32_t row_to_batch_thread_number = 3; + std::vector pk_columns; + std::vector> extra_options; +}; + +struct SourceDataSpec { + std::string format; + std::string path; +}; + +BenchmarkCliOptions& MutableBenchmarkCliOptions() { + static BenchmarkCliOptions options; + return options; +} + +const BenchmarkCliOptions& 
GetBenchmarkCliOptions() { + return MutableBenchmarkCliOptions(); +} + +int64_t ParsePositiveInt64(const std::string& value, const std::string& option_name) { + char* end = nullptr; + const auto parsed = std::strtoll(value.c_str(), &end, 10); + if (end == value.c_str() || *end != '\0' || parsed <= 0) { + throw std::runtime_error("invalid " + option_name + ", expected positive integer"); + } + return static_cast(parsed); +} + +int32_t ParsePositiveInt32(const std::string& value, const std::string& option_name) { + const int64_t parsed = ParsePositiveInt64(value, option_name); + if (parsed > std::numeric_limits::max()) { + throw std::runtime_error("invalid " + option_name + ", value is too large"); + } + return static_cast(parsed); +} + +void ParsePaimonBenchmarkCliArgsImpl(int32_t* argc, char** argv) { + auto& options = MutableBenchmarkCliOptions(); + const int32_t parsed_argc = *argc; + int32_t write_index = 1; + for (int32_t arg_index = 1; arg_index < parsed_argc; ++arg_index) { + const std::string arg(argv[arg_index]); + std::string parsed_value; + + if (paimon::benchmark::ParseStringOptionArg(parsed_argc, argv, arg, + "--paimon_source_data_file", &arg_index, + &options.source_data_file)) { + continue; + } + if (paimon::benchmark::ParseStringOptionArg(parsed_argc, argv, arg, + "--paimon_external_table_path", &arg_index, + &options.external_table_path)) { + continue; + } + if (paimon::benchmark::ParseStringOptionArg(parsed_argc, argv, arg, "--paimon_file_format", + &arg_index, &options.file_format)) { + continue; + } + if (paimon::benchmark::ParseStringOptionArg(parsed_argc, argv, arg, + "--paimon_source_batch_max_rows", &arg_index, + &parsed_value)) { + options.source_batch_max_rows = + ParsePositiveInt64(parsed_value, "--paimon_source_batch_max_rows"); + continue; + } + if (paimon::benchmark::ParseStringOptionArg(parsed_argc, argv, arg, + "--paimon_row_to_batch_thread_number", + &arg_index, &parsed_value)) { + options.row_to_batch_thread_number = + 
ParsePositiveInt32(parsed_value, "--paimon_row_to_batch_thread_number"); + continue; + } + if (paimon::benchmark::ParseCommaSeparatedOptionArg( + parsed_argc, argv, arg, "--paimon_pk_columns", &arg_index, &options.pk_columns)) { + continue; + } + if (paimon::benchmark::ParseDelimitedRepeatableOptionArg( + parsed_argc, argv, arg, "--paimon_option", &arg_index, &options.extra_options)) { + continue; + } + + argv[write_index++] = argv[arg_index]; + } + + *argc = write_index; + argv[write_index] = nullptr; +} + +bool HasHelpFlagImpl(int32_t argc, char** argv) { + for (int32_t arg_index = 1; arg_index < argc; ++arg_index) { + const std::string arg(argv[arg_index]); + if (arg == "-h" || arg == "--help" || arg == "--help=true") { + return true; + } + } + return false; +} + +void PrintPaimonBenchmarkCliHelpImpl() { + std::cout << "Paimon benchmark custom options:\n" + << " --paimon_source_data_file=\n" + << " Required. External source data file used to build benchmark data.\n" + << " Currently supports Parquet source files.\n" + << " Also supports: --paimon_source_data_file \n" + << " --paimon_external_table_path=\n" + << " Optional for BM_Read and BM_MOR_Read. If set, read directly from existing\n" + << " table path and skip source file loading and pre-write stage.\n" + << " Also supports: --paimon_external_table_path \n" + << " --paimon_file_format=\n" + << " Optional. Target table file format. Default: parquet.\n" + << " Also supports: --paimon_file_format \n" + << " --paimon_source_batch_max_rows=\n" + << " Optional. Max rows per source batch. Default: 4096.\n" + << " --paimon_row_to_batch_thread_number=\n" + << " Optional. Row-to-batch thread number for reads. Default: 3.\n" + << " --paimon_pk_columns=\n" + << " Required by BM_PK_Write and BM_MOR_Read.\n" + << " Also supports: --paimon_pk_columns \n" + << " --paimon_option=:;:\n" + << " Optional and repeatable. 
Pass through table options as-is.\n" + << " Also supports: --paimon_option :;:\n" + << " Note: use quotes in shell, e.g. \"--paimon_option k1:v1;k2:v2\".\n" + << "\n" + << "Example:\n" + << " paimon-read-write-benchmark --paimon_source_data_file /path/data.parquet \\\n" + << " --paimon_file_format parquet --paimon_pk_columns=id \\\n" + << " --paimon_option \"read.batch-size:8192\" --benchmark_filter=BM_Read\n" + << std::endl; +} + +std::unique_ptr CreateBenchmarkWorkspace() { + auto workspace = paimon::test::UniqueTestDirectory::Create(); + if (workspace == nullptr) { + throw std::runtime_error("failed to create benchmark workspace"); + } + return workspace; +} + +uint64_t NextTableId() { + static std::atomic id{0}; + return ++id; +} + +std::string RequirePath(const std::string& root_path, const std::string& db_name, + const std::string& table_name) { + return root_path + "/" + db_name + ".db/" + table_name; +} + +template +T ValueOrThrow(paimon::Result&& result, const std::string& context) { + if (!result.ok()) { + throw std::runtime_error(context + ": " + result.status().ToString()); + } + return std::move(result).value(); +} + +void CheckStatus(const paimon::Status& status, const std::string& context) { + if (!status.ok()) { + throw std::runtime_error(context + ": " + status.ToString()); + } +} + +void SkipWithMessage(::benchmark::State& state, const std::string& message) { + static thread_local std::string owned_message; + owned_message = message; + state.SkipWithError(owned_message.c_str()); +} + +std::string GetConfiguredFileFormat() { + std::string file_format = GetBenchmarkCliOptions().file_format; + for (const auto& kv : GetBenchmarkCliOptions().extra_options) { + if (kv.first == paimon::Options::FILE_FORMAT) { + file_format = kv.second; + } + } + return file_format; +} + +bool IsFileFormatSupported(const std::string& format) { + if (format == "parquet") { + return true; + } + if (format == "orc") { +#ifdef PAIMON_ENABLE_ORC + return true; +#else + return 
false; +#endif + } + return false; +} + +void ApplyExtraOptions(std::map* options) { + for (const auto& kv : GetBenchmarkCliOptions().extra_options) { + (*options)[kv.first] = kv.second; + } +} + +std::map BuildOptions(const std::string& file_format) { + std::map options = { + {paimon::Options::FILE_FORMAT, file_format}, + }; + ApplyExtraOptions(&options); + return options; +} + +std::map BuildPkOptions(const std::string& file_format) { + auto options = BuildOptions(file_format); + options[paimon::Options::BUCKET] = "1"; + options[paimon::Options::MERGE_ENGINE] = "deduplicate"; + return options; +} + +std::string GetSourceDataFilePath() { + return GetBenchmarkCliOptions().source_data_file; +} + +std::string GetExternalTablePath() { + return GetBenchmarkCliOptions().external_table_path; +} + +const std::vector& GetPkColumns() { + return GetBenchmarkCliOptions().pk_columns; +} + +SourceDataSpec GetSourceDataSpec() { + const std::string source_data_file_path = GetSourceDataFilePath(); + if (!source_data_file_path.empty()) { + return {"parquet", source_data_file_path}; + } + return {"", ""}; +} + +int64_t GetSourceBatchMaxRows() { + return GetBenchmarkCliOptions().source_batch_max_rows; +} + +int32_t GetRowToBatchThreadNumber() { + return GetBenchmarkCliOptions().row_to_batch_thread_number; +} + +bool SupportsParquetSourceDataMode() { +#if PAIMON_BENCHMARK_HAS_PARQUET_READER + return true; +#else + return false; +#endif +} + +bool SupportsSourceDataMode(const std::string& source_format) { + if (source_format == "parquet") { + return SupportsParquetSourceDataMode(); + } + return false; +} + +struct SourceDataMetadata { + std::shared_ptr schema; + int64_t total_rows = 0; + std::string format; + std::string path; +}; + +#if PAIMON_BENCHMARK_HAS_PARQUET_READER +std::unique_ptr OpenParquetSourceReader(const std::string& path) { + auto input = arrow::io::ReadableFile::Open(path); + if (!input.ok()) { + throw std::runtime_error("open Parquet source failed: " + path + ", " + + 
input.status().ToString()); + } + + std::unique_ptr parquet_reader; + const auto open_status = parquet::arrow::OpenFile( + input.ValueUnsafe(), arrow::default_memory_pool(), &parquet_reader); + if (!open_status.ok()) { + throw std::runtime_error("create Parquet reader failed: " + open_status.ToString()); + } + parquet_reader->set_batch_size(GetSourceBatchMaxRows()); + return parquet_reader; +} +#endif + +const SourceDataMetadata& LoadParquetSourceMetadata(const std::string& path) { +#if !PAIMON_BENCHMARK_HAS_PARQUET_READER + throw std::runtime_error( + "Parquet source data mode requires parquet::arrow reader support in this build"); +#else + static SourceDataMetadata cache; + if (cache.path == path && cache.format == "parquet") { + return cache; + } + + auto parquet_reader = OpenParquetSourceReader(path); + std::shared_ptr schema; + const auto schema_status = parquet_reader->GetSchema(&schema); + if (!schema_status.ok()) { + throw std::runtime_error("read Parquet source schema failed: " + schema_status.ToString()); + } + + const int64_t total_rows = parquet_reader->parquet_reader()->metadata()->num_rows(); + if (total_rows <= 0) { + throw std::runtime_error("Parquet source is empty: " + path); + } + + cache.schema = std::move(schema); + cache.total_rows = total_rows; + cache.format = "parquet"; + cache.path = path; + return cache; +#endif +} + +SourceDataMetadata LoadSourceDataMetadata(const SourceDataSpec& source_spec) { + if (source_spec.format == "parquet") { + return LoadParquetSourceMetadata(source_spec.path); + } + throw std::runtime_error("unknown source format: " + source_spec.format); +} + +std::shared_ptr BuildStructArrayFromRecordBatch( + const std::shared_ptr& batch) { + return std::make_shared(arrow::struct_(batch->schema()->fields()), + batch->num_rows(), batch->columns()); +} + +std::unique_ptr MakeRecordBatch( + const std::shared_ptr& arr) { + ArrowArray c_array; + if (!arrow::ExportArray(*arr, &c_array).ok()) { + throw std::runtime_error("failed to 
export arrow array"); + } + paimon::RecordBatchBuilder builder(&c_array); + builder.SetBucket(0); + return ValueOrThrow(builder.Finish(), "build paimon record batch"); +} + +void EnsureTable(const std::string& root_path, const std::string& db_name, + const std::string& table_name, const std::map& options, + const std::shared_ptr& schema, + const std::vector& primary_keys = {}) { + auto catalog = ValueOrThrow(paimon::Catalog::Create(root_path, options), "create catalog"); + CheckStatus(catalog->CreateDatabase(db_name, options, true), "create database"); + + ArrowSchema c_schema; + if (!arrow::ExportSchema(*schema, &c_schema).ok()) { + throw std::runtime_error("failed to export table schema"); + } + CheckStatus(catalog->CreateTable(paimon::Identifier(db_name, table_name), &c_schema, + /*partition_keys=*/{}, primary_keys, options, + /*ignore_if_exists=*/false), + "create table"); +} + +void WriteSourceDataToWriter(paimon::FileStoreWrite* writer, const SourceDataSpec& source_spec) { + if (source_spec.format != "parquet") { + throw std::runtime_error("unknown source format: " + source_spec.format); + } + +#if !PAIMON_BENCHMARK_HAS_PARQUET_READER + throw std::runtime_error( + "Parquet source data mode requires parquet::arrow reader support in this build"); +#else + auto parquet_reader = OpenParquetSourceReader(source_spec.path); + std::unique_ptr batch_reader; + const auto reader_status = parquet_reader->GetRecordBatchReader(&batch_reader); + if (!reader_status.ok()) { + throw std::runtime_error("create Parquet source batch reader failed: " + + reader_status.ToString()); + } + + int64_t written_rows = 0; + while (true) { + std::shared_ptr record_batch; + const auto read_status = batch_reader->ReadNext(&record_batch); + if (!read_status.ok()) { + throw std::runtime_error("read Parquet source batch failed: " + read_status.ToString()); + } + if (record_batch == nullptr) { + break; + } + if (record_batch->num_rows() <= 0) { + continue; + } + + auto struct_array = 
BuildStructArrayFromRecordBatch(record_batch); + auto batch = MakeRecordBatch(struct_array); + CheckStatus(writer->Write(std::move(batch)), "write batch"); + written_rows += record_batch->num_rows(); + } + + if (written_rows <= 0) { + throw std::runtime_error("source file has no non-empty data batches: " + source_spec.path); + } +#endif +} + +void WriteAndCommit(const std::string& table_path, + const std::map& options, + const SourceDataSpec& source_spec) { + paimon::WriteContextBuilder write_builder(table_path, "benchmark-writer"); + auto write_ctx = + ValueOrThrow(write_builder.SetOptions(options).Finish(), "create write context"); + auto writer = ValueOrThrow(paimon::FileStoreWrite::Create(std::move(write_ctx)), + "create file store writer"); + + WriteSourceDataToWriter(writer.get(), source_spec); + auto messages = ValueOrThrow(writer->PrepareCommit(), "prepare commit"); + + paimon::CommitContextBuilder commit_builder(table_path, "benchmark-writer"); + auto commit_ctx = + ValueOrThrow(commit_builder.SetOptions(options).Finish(), "create commit context"); + auto committer = + ValueOrThrow(paimon::FileStoreCommit::Create(std::move(commit_ctx)), "create committer"); + CheckStatus(committer->Commit(messages), "commit write"); +} + +struct SharedReadTableCache { + std::string key; + std::unique_ptr workspace; + std::string table_path; + int64_t total_rows = 0; +}; + +struct SharedMorReadTableCache { + std::string key; + std::unique_ptr workspace; + std::string table_path; + int64_t total_rows = 0; +}; + +std::string BuildReadTableCacheKey(const std::string& file_format, + const SourceDataSpec& source_spec) { + return file_format + "|" + source_spec.format + "|" + source_spec.path + "|" + + std::to_string(GetSourceBatchMaxRows()); +} + +std::string JoinColumns(const std::vector& columns) { + std::string joined; + for (size_t i = 0; i < columns.size(); ++i) { + if (i > 0) { + joined.append(","); + } + joined.append(columns[i]); + } + return joined; +} + +const 
SharedMorReadTableCache& GetOrCreateSharedMorReadTable(const std::string& file_format, + const SourceDataSpec& source_spec) { + static SharedMorReadTableCache cache; + static std::mutex cache_mutex; + + const std::vector& pk_columns = GetPkColumns(); + const std::string cache_key = + BuildReadTableCacheKey(file_format, source_spec) + "|pk=" + JoinColumns(pk_columns); + std::lock_guard lock(cache_mutex); + if (cache.workspace != nullptr && cache.key == cache_key) { + std::cout << "[benchmark][mor-read] reuse_output_table_path=" << cache.table_path + << std::endl; + return cache; + } + + auto options = BuildPkOptions(file_format); + const auto source_metadata = LoadSourceDataMetadata(source_spec); + + auto workspace = CreateBenchmarkWorkspace(); + const std::string db_name = "bench_db"; + const std::string table_name = "mor_read_shared_" + std::to_string(NextTableId()); + EnsureTable(workspace->Str(), db_name, table_name, options, source_metadata.schema, + /*primary_keys=*/pk_columns); + const std::string table_path = RequirePath(workspace->Str(), db_name, table_name); + std::cout << "[benchmark][mor-read] create_shared_output_table_path=" << table_path + << std::endl; + WriteAndCommit(table_path, options, source_spec); + + cache.key = cache_key; + cache.workspace = std::move(workspace); + cache.table_path = table_path; + cache.total_rows = source_metadata.total_rows; + return cache; +} + +const SharedReadTableCache& GetOrCreateSharedReadTable(const std::string& file_format, + const SourceDataSpec& source_spec) { + static SharedReadTableCache cache; + static std::mutex cache_mutex; + + const std::string cache_key = BuildReadTableCacheKey(file_format, source_spec); + std::lock_guard lock(cache_mutex); + if (cache.workspace != nullptr && cache.key == cache_key) { + std::cout << "[benchmark][read] reuse_output_table_path=" << cache.table_path << std::endl; + return cache; + } + + auto options = BuildOptions(file_format); + const auto source_metadata = 
LoadSourceDataMetadata(source_spec); + + auto workspace = CreateBenchmarkWorkspace(); + const std::string db_name = "bench_db"; + const std::string table_name = "read_shared_" + std::to_string(NextTableId()); + EnsureTable(workspace->Str(), db_name, table_name, options, source_metadata.schema); + const std::string table_path = RequirePath(workspace->Str(), db_name, table_name); + std::cout << "[benchmark][read] create_shared_output_table_path=" << table_path << std::endl; + WriteAndCommit(table_path, options, source_spec); + + cache.key = cache_key; + cache.workspace = std::move(workspace); + cache.table_path = table_path; + cache.total_rows = source_metadata.total_rows; + return cache; +} + +int64_t ReadRows(const std::string& table_path, const std::map& options, + int32_t prefetch_parallel_num) { + paimon::ScanContextBuilder scan_builder(table_path); + auto scan_ctx = ValueOrThrow(scan_builder.SetOptions(options).Finish(), "create scan context"); + auto scanner = ValueOrThrow(paimon::TableScan::Create(std::move(scan_ctx)), "create scanner"); + auto plan = ValueOrThrow(scanner->CreatePlan(), "create plan"); + + paimon::ReadContextBuilder read_builder(table_path); + constexpr int32_t kPrefetchBatchCount = 600; + read_builder.SetOptions(options) + .EnablePrefetch(true) + .SetPrefetchBatchCount(kPrefetchBatchCount) + .SetPrefetchMaxParallelNum(prefetch_parallel_num) + .EnableMultiThreadRowToBatch(GetRowToBatchThreadNumber() > 1) + .SetRowToBatchThreadNumber(GetRowToBatchThreadNumber()); + auto read_ctx = ValueOrThrow(read_builder.Finish(), "create read context"); + auto reader = + ValueOrThrow(paimon::TableRead::Create(std::move(read_ctx)), "create table reader"); + auto batch_reader = ValueOrThrow(reader->CreateReader(plan->Splits()), "create batch reader"); + + int64_t total_rows = 0; + while (true) { + auto batch = ValueOrThrow(batch_reader->NextBatch(), "read next batch"); + if (paimon::BatchReader::IsEofBatch(batch)) { + break; + } + auto& [array, schema] = 
batch; + auto imported = arrow::ImportArray(array.get(), schema.get()); + if (!imported.ok()) { + throw std::runtime_error("import c data array failed: " + imported.status().ToString()); + } + total_rows += imported.ValueUnsafe()->length(); + } + + return total_rows; +} + +struct PreparedSourceData { + std::shared_ptr schema; + int64_t total_rows = 0; +}; + +bool TryGetSourceSpec(::benchmark::State& state, SourceDataSpec* source_spec) { + try { + *source_spec = GetSourceDataSpec(); + return true; + } catch (const std::exception& e) { + SkipWithMessage(state, e.what()); + return false; + } +} + +bool TryPrepareSourceData(::benchmark::State& state, const SourceDataSpec& source_spec, + PreparedSourceData* prepared) { + try { + const auto source_metadata = LoadSourceDataMetadata(source_spec); + prepared->schema = source_metadata.schema; + prepared->total_rows = source_metadata.total_rows; + return true; + } catch (const std::exception& e) { + SkipWithMessage(state, e.what()); + return false; + } +} + +} // namespace + +void ParsePaimonBenchmarkCliArgs(int* argc, char** argv) { + auto parsed_argc = static_cast(*argc); + ParsePaimonBenchmarkCliArgsImpl(&parsed_argc, argv); + *argc = static_cast(parsed_argc); +} + +bool HasHelpFlag(int32_t argc, char** argv) { + return HasHelpFlagImpl(argc, argv); +} + +void PrintPaimonBenchmarkCliHelp() { + PrintPaimonBenchmarkCliHelpImpl(); +} + +void RunBMWrite(::benchmark::State& state) { + const std::string file_format = GetConfiguredFileFormat(); + SourceDataSpec source_spec; + if (!TryGetSourceSpec(state, &source_spec)) { + return; + } + if (!BenchmarkHelpers::ValidateSourcePresenceOrSkip( + state, source_spec.path, "--paimon_source_data_file is required", &SkipWithMessage)) { + return; + } + if (!BenchmarkHelpers::ValidateSourceSupportOrSkip(state, source_spec.format, + SupportsSourceDataMode(source_spec.format), + &SkipWithMessage)) { + return; + } + if (!BenchmarkHelpers::ValidateFileFormatOrSkip( + state, file_format, 
IsFileFormatSupported(file_format), &SkipWithMessage)) { + return; + } + + auto options = BuildOptions(file_format); + PreparedSourceData prepared; + if (!TryPrepareSourceData(state, source_spec, &prepared)) { + return; + } + auto workspace = CreateBenchmarkWorkspace(); + + for (auto _ : state) { + const std::string db_name = "bench_db"; + const std::string table_name = "write_" + std::to_string(NextTableId()); + EnsureTable(workspace->Str(), db_name, table_name, options, prepared.schema); + const std::string table_path = RequirePath(workspace->Str(), db_name, table_name); + std::cout << "[benchmark][write] output_table_path=" << table_path << std::endl; + WriteAndCommit(table_path, options, source_spec); + } + + state.SetItemsProcessed(state.iterations() * prepared.total_rows); +} + +void RunBMRead(::benchmark::State& state) { + const auto prefetch_parallel_num = static_cast(state.range(0)); + const std::string file_format = GetConfiguredFileFormat(); + const std::string external_table_path = GetExternalTablePath(); + SourceDataSpec source_spec; + if (!TryGetSourceSpec(state, &source_spec)) { + return; + } + if (!BenchmarkHelpers::ValidateFileFormatOrSkip( + state, file_format, IsFileFormatSupported(file_format), &SkipWithMessage)) { + return; + } + + if (!BenchmarkHelpers::ValidatePrefetchParallelOrSkip(state, prefetch_parallel_num, + &SkipWithMessage)) { + return; + } + + auto options = BuildOptions(file_format); + + if (BenchmarkHelpers::TryRunExternalReadMode(state, "read", external_table_path, [&]() { + return ReadRows(external_table_path, options, prefetch_parallel_num); + })) { + return; + } + + if (!BenchmarkHelpers::ValidateSourcePresenceOrSkip( + state, source_spec.path, + "--paimon_source_data_file is required when --paimon_external_table_path is not set", + &SkipWithMessage)) { + return; + } + if (!BenchmarkHelpers::ValidateSourceSupportOrSkip(state, source_spec.format, + SupportsSourceDataMode(source_spec.format), + &SkipWithMessage)) { + return; + } 
+ + const SharedReadTableCache* shared_table = nullptr; + try { + shared_table = &GetOrCreateSharedReadTable(file_format, source_spec); + } catch (const std::exception& e) { + SkipWithMessage(state, e.what()); + return; + } + + const int64_t rows_read = BenchmarkHelpers::RunReadIterations(state, [&]() { + return ReadRows(shared_table->table_path, options, prefetch_parallel_num); + }); + + state.SetItemsProcessed(state.iterations() * rows_read); +} + +void RunBMPkWrite(::benchmark::State& state) { + const std::string file_format = GetConfiguredFileFormat(); + SourceDataSpec source_spec; + if (!TryGetSourceSpec(state, &source_spec)) { + return; + } + if (!BenchmarkHelpers::ValidateSourcePresenceOrSkip( + state, source_spec.path, "--paimon_source_data_file is required", &SkipWithMessage)) { + return; + } + if (!BenchmarkHelpers::ValidateSourceSupportOrSkip(state, source_spec.format, + SupportsSourceDataMode(source_spec.format), + &SkipWithMessage)) { + return; + } + if (!BenchmarkHelpers::ValidateFileFormatOrSkip( + state, file_format, IsFileFormatSupported(file_format), &SkipWithMessage)) { + return; + } + const std::vector& pk_columns = GetPkColumns(); + if (pk_columns.empty()) { + SkipWithMessage(state, "--paimon_pk_columns is required for BM_PK_Write"); + return; + } + + auto options = BuildPkOptions(file_format); + PreparedSourceData prepared; + if (!TryPrepareSourceData(state, source_spec, &prepared)) { + return; + } + auto workspace = CreateBenchmarkWorkspace(); + + for (auto _ : state) { + const std::string db_name = "bench_db"; + const std::string table_name = "pk_write_" + std::to_string(NextTableId()); + EnsureTable(workspace->Str(), db_name, table_name, options, prepared.schema, + /*primary_keys=*/pk_columns); + const std::string table_path = RequirePath(workspace->Str(), db_name, table_name); + std::cout << "[benchmark][pk-write] output_table_path=" << table_path << std::endl; + WriteAndCommit(table_path, options, source_spec); + } + + 
state.SetItemsProcessed(state.iterations() * prepared.total_rows); +} + +void RunBMMorRead(::benchmark::State& state) { + const auto prefetch_parallel_num = static_cast(state.range(0)); + const std::string file_format = GetConfiguredFileFormat(); + const std::string external_table_path = GetExternalTablePath(); + SourceDataSpec source_spec; + if (!TryGetSourceSpec(state, &source_spec)) { + return; + } + if (!BenchmarkHelpers::ValidateFileFormatOrSkip( + state, file_format, IsFileFormatSupported(file_format), &SkipWithMessage)) { + return; + } + if (!BenchmarkHelpers::ValidatePrefetchParallelOrSkip(state, prefetch_parallel_num, + &SkipWithMessage)) { + return; + } + + const auto external_read_options = BuildOptions(file_format); + if (BenchmarkHelpers::TryRunExternalReadMode(state, "mor-read", external_table_path, [&]() { + return ReadRows(external_table_path, external_read_options, prefetch_parallel_num); + })) { + return; + } + + if (!BenchmarkHelpers::ValidateSourcePresenceOrSkip( + state, source_spec.path, + "--paimon_source_data_file is required when --paimon_external_table_path is not set", + &SkipWithMessage)) { + return; + } + if (!BenchmarkHelpers::ValidateSourceSupportOrSkip(state, source_spec.format, + SupportsSourceDataMode(source_spec.format), + &SkipWithMessage)) { + return; + } + if (GetPkColumns().empty()) { + SkipWithMessage(state, "--paimon_pk_columns is required for BM_MOR_Read"); + return; + } + + auto options = BuildPkOptions(file_format); + const SharedMorReadTableCache* shared_table = nullptr; + try { + shared_table = &GetOrCreateSharedMorReadTable(file_format, source_spec); + } catch (const std::exception& e) { + SkipWithMessage(state, e.what()); + return; + } + + const int64_t rows_read = BenchmarkHelpers::RunReadIterations(state, [&]() { + return ReadRows(shared_table->table_path, options, prefetch_parallel_num); + }); + state.SetItemsProcessed(state.iterations() * rows_read); +} + +} // namespace paimon::benchmark diff --git 
a/benchmark/benchmark_suite.h b/benchmark/benchmark_suite.h new file mode 100644 index 000000000..43c07af7f --- /dev/null +++ b/benchmark/benchmark_suite.h @@ -0,0 +1,34 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include "benchmark/benchmark.h" + +namespace paimon::benchmark { + +void ParsePaimonBenchmarkCliArgs(int* argc, char** argv); +bool HasHelpFlag(int32_t argc, char** argv); +void PrintPaimonBenchmarkCliHelp(); + +void RunBMWrite(::benchmark::State& state); +void RunBMRead(::benchmark::State& state); +void RunBMPkWrite(::benchmark::State& state); +void RunBMMorRead(::benchmark::State& state); + +} // namespace paimon::benchmark diff --git a/benchmark/cli_option_parsing.h b/benchmark/cli_option_parsing.h new file mode 100644 index 000000000..448ce5ec5 --- /dev/null +++ b/benchmark/cli_option_parsing.h @@ -0,0 +1,164 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace paimon::benchmark { + +inline bool ConsumeCliOption(const std::string& arg, const std::string& option_name, + std::string* value_out) { + const std::string prefix = option_name + "="; + if (arg.rfind(prefix, 0) != 0) { + return false; + } + *value_out = arg.substr(prefix.size()); + return true; +} + +inline std::string TrimAsciiWhitespace(const std::string& value) { + const auto first = value.find_first_not_of(" \t\n\r"); + if (first == std::string::npos) { + return ""; + } + const auto last = value.find_last_not_of(" \t\n\r"); + return value.substr(first, last - first + 1); +} + +inline std::vector ParseCommaSeparatedColumns(const std::string& input, + const std::string& option_name) { + if (input.empty()) { + throw std::runtime_error("missing value for " + option_name); + } + + std::vector columns; + size_t segment_start = 0; + for (size_t index = 0; index <= input.size(); ++index) { + if (index != input.size() && input[index] != ',') { + continue; + } + + const std::string column = + TrimAsciiWhitespace(input.substr(segment_start, index - segment_start)); + if (column.empty()) { + throw std::runtime_error("invalid " + option_name + ": empty column name"); + } + columns.push_back(column); + segment_start = index + 1; + } + return columns; +} + +inline std::vector> ParseDelimitedOptions( + const std::string& input, const std::string& option_name) { + if (input.empty()) { + throw std::runtime_error("missing value for " + option_name); + } + + std::vector> parsed; + std::string token; + for (size_t index = 0; index <= input.size(); ++index) { + const bool at_end = (index == input.size()); + if (!at_end && input[index] != ';') { + token.push_back(input[index]); + continue; + } + + if (token.empty()) { + throw std::runtime_error("invalid " + option_name + ": empty option 
segment"); + } + + const auto separator = token.find(':'); + if (separator == std::string::npos || separator == 0 || separator + 1 >= token.size()) { + throw std::runtime_error("invalid " + option_name + ": expected key:value"); + } + + parsed.emplace_back(token.substr(0, separator), token.substr(separator + 1)); + token.clear(); + } + return parsed; +} + +inline bool ParseStringOptionArg(int32_t argc, char** argv, const std::string& arg, + const std::string& option_name, int32_t* arg_index, + std::string* value_out) { + std::string parsed_value; + if (ConsumeCliOption(arg, option_name, &parsed_value)) { + *value_out = std::move(parsed_value); + return true; + } + + if (arg != option_name) { + return false; + } + + if (*arg_index + 1 >= argc) { + throw std::runtime_error("missing value for " + option_name); + } + *value_out = argv[++(*arg_index)]; + return true; +} + +inline bool ParseCommaSeparatedOptionArg(int32_t argc, char** argv, const std::string& arg, + const std::string& option_name, int32_t* arg_index, + std::vector* columns_out) { + std::string parsed_value; + if (ConsumeCliOption(arg, option_name, &parsed_value)) { + *columns_out = ParseCommaSeparatedColumns(parsed_value, option_name); + return true; + } + + if (arg != option_name) { + return false; + } + + if (*arg_index + 1 >= argc) { + throw std::runtime_error("missing value for " + option_name); + } + *columns_out = ParseCommaSeparatedColumns(std::string(argv[++(*arg_index)]), option_name); + return true; +} + +inline bool ParseDelimitedRepeatableOptionArg( + int32_t argc, char** argv, const std::string& arg, const std::string& option_name, + int32_t* arg_index, std::vector>* options_out) { + std::string parsed_value; + if (ConsumeCliOption(arg, option_name, &parsed_value)) { + const auto parsed_options = ParseDelimitedOptions(parsed_value, option_name); + options_out->insert(options_out->end(), parsed_options.begin(), parsed_options.end()); + return true; + } + + if (arg != option_name) { + return 
false; + } + + if (*arg_index + 1 >= argc) { + throw std::runtime_error("missing value for " + option_name); + } + + const std::string option_arg = argv[++(*arg_index)]; + const auto parsed_options = ParseDelimitedOptions(option_arg, option_name); + options_out->insert(options_out->end(), parsed_options.begin(), parsed_options.end()); + return true; +} + +} // namespace paimon::benchmark diff --git a/benchmark/cli_option_parsing_test.cpp b/benchmark/cli_option_parsing_test.cpp new file mode 100644 index 000000000..7c33cdf39 --- /dev/null +++ b/benchmark/cli_option_parsing_test.cpp @@ -0,0 +1,148 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "benchmark/cli_option_parsing.h" + +#include +#include +#include +#include + +#include "gtest/gtest.h" + +namespace paimon::testing { +namespace { + +struct ArgvHolder { + std::vector args; + std::vector argv; + + explicit ArgvHolder(std::vector in_args) : args(std::move(in_args)) { + argv.reserve(args.size()); + for (auto& arg : args) { + argv.push_back(arg.data()); + } + } + + int32_t argc() const { + return static_cast(argv.size()); + } +}; + +TEST(CliOptionParsingTest, ConsumeCliOptionWorks) { + std::string value; + ASSERT_TRUE(paimon::benchmark::ConsumeCliOption("--foo=bar", "--foo", &value)); + ASSERT_EQ(value, "bar"); + + value.clear(); + ASSERT_FALSE(paimon::benchmark::ConsumeCliOption("--foo", "--foo", &value)); +} + +TEST(CliOptionParsingTest, ParseCommaSeparatedColumnsWorks) { + const auto parsed = paimon::benchmark::ParseCommaSeparatedColumns("id, name,age", "--cols"); + ASSERT_EQ(parsed.size(), 3U); + ASSERT_EQ(parsed[0], "id"); + ASSERT_EQ(parsed[1], "name"); + ASSERT_EQ(parsed[2], "age"); +} + +TEST(CliOptionParsingTest, ParseCommaSeparatedColumnsRejectsInvalidInput) { + ASSERT_THROW((void)paimon::benchmark::ParseCommaSeparatedColumns("", "--cols"), + std::runtime_error); + ASSERT_THROW((void)paimon::benchmark::ParseCommaSeparatedColumns("id,", "--cols"), + std::runtime_error); + ASSERT_THROW((void)paimon::benchmark::ParseCommaSeparatedColumns("id,,name", "--cols"), + std::runtime_error); +} + +TEST(CliOptionParsingTest, ParseDelimitedOptionsWorks) { + const auto parsed = paimon::benchmark::ParseDelimitedOptions("k1:v1;k2:v2", "--paimon_option"); + ASSERT_EQ(parsed.size(), 2U); + ASSERT_EQ(parsed[0], std::make_pair(std::string("k1"), std::string("v1"))); + ASSERT_EQ(parsed[1], std::make_pair(std::string("k2"), std::string("v2"))); +} + +TEST(CliOptionParsingTest, ParseDelimitedOptionsRejectsInvalidInput) { + ASSERT_THROW((void)paimon::benchmark::ParseDelimitedOptions("", "--paimon_option"), + std::runtime_error); + 
ASSERT_THROW((void)paimon::benchmark::ParseDelimitedOptions("k1:v1;", "--paimon_option"), + std::runtime_error); +} + +TEST(CliOptionParsingTest, ParseStringOptionArgWorksForEqualsAndSeparatedForms) { + { + ArgvHolder argv_holder({"prog", "--foo=bar"}); + int32_t arg_index = 1; + std::string value; + ASSERT_TRUE(paimon::benchmark::ParseStringOptionArg( + argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[arg_index], "--foo", + &arg_index, &value)); + ASSERT_EQ(arg_index, 1); + ASSERT_EQ(value, "bar"); + } + + { + ArgvHolder argv_holder({"prog", "--foo", "bar"}); + int32_t arg_index = 1; + std::string value; + ASSERT_TRUE(paimon::benchmark::ParseStringOptionArg( + argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[arg_index], "--foo", + &arg_index, &value)); + ASSERT_EQ(arg_index, 2); + ASSERT_EQ(value, "bar"); + } +} + +TEST(CliOptionParsingTest, ParseStringOptionArgRejectsMissingValue) { + ArgvHolder argv_holder({"prog", "--foo"}); + int32_t arg_index = 1; + std::string value; + ASSERT_THROW((void)paimon::benchmark::ParseStringOptionArg( + argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[arg_index], + "--foo", &arg_index, &value), + std::runtime_error); +} + +TEST(CliOptionParsingTest, ParseCommaSeparatedOptionArgAndDelimitedRepeatableOptionArgWorks) { + { + ArgvHolder argv_holder({"prog", "--cols", "id,name"}); + int32_t arg_index = 1; + std::vector columns; + ASSERT_TRUE(paimon::benchmark::ParseCommaSeparatedOptionArg( + argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[arg_index], "--cols", + &arg_index, &columns)); + ASSERT_EQ(arg_index, 2); + ASSERT_EQ(columns.size(), 2U); + ASSERT_EQ(columns[0], "id"); + ASSERT_EQ(columns[1], "name"); + } + + { + ArgvHolder argv_holder({"prog", "--paimon_option", "k1:v1;k2:v2"}); + int32_t arg_index = 1; + std::vector> options; + ASSERT_TRUE(paimon::benchmark::ParseDelimitedRepeatableOptionArg( + argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[arg_index], + 
"--paimon_option", &arg_index, &options)); + ASSERT_EQ(arg_index, 2); + ASSERT_EQ(options.size(), 2U); + ASSERT_EQ(options[0], std::make_pair(std::string("k1"), std::string("v1"))); + ASSERT_EQ(options[1], std::make_pair(std::string("k2"), std::string("v2"))); + } +} + +} // namespace +} // namespace paimon::testing diff --git a/benchmark/read_write_benchmark.cpp b/benchmark/read_write_benchmark.cpp new file mode 100644 index 000000000..0710cdc9b --- /dev/null +++ b/benchmark/read_write_benchmark.cpp @@ -0,0 +1,45 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include "benchmark/benchmark.h" +#include "benchmark/benchmark_suite.h" + +int main(int argc, char** argv) { + try { + paimon::benchmark::ParsePaimonBenchmarkCliArgs(&argc, argv); + } catch (const std::exception& e) { + std::cerr << "paimon-read-write-benchmark: " << e.what() << std::endl; + std::cerr << "Try 'paimon-read-write-benchmark --help' for more information." 
<< std::endl; + return 1; + } + + if (paimon::benchmark::HasHelpFlag(static_cast(argc), argv)) { + paimon::benchmark::PrintPaimonBenchmarkCliHelp(); + return 0; + } + + benchmark::Initialize(&argc, argv); + if (benchmark::ReportUnrecognizedArguments(argc, argv)) { + return 1; + } + benchmark::RunSpecifiedBenchmarks(); + benchmark::Shutdown(); + return 0; +} diff --git a/cmake_modules/BuildUtils.cmake b/cmake_modules/BuildUtils.cmake index ed27ed786..d6d3b4a58 100644 --- a/cmake_modules/BuildUtils.cmake +++ b/cmake_modules/BuildUtils.cmake @@ -404,3 +404,121 @@ function(add_paimon_test REL_TEST_NAME) ${PCH_ARGS} ${ARG_UNPARSED_ARGUMENTS}) endfunction() + +function(add_benchmark_case REL_BENCHMARK_NAME) + set(options ENABLED) + set(one_value_args) + set(multi_value_args + SOURCES + STATIC_LINK_LIBS + EXTRA_LINK_LIBS + EXTRA_INCLUDES + LABELS + PREFIX) + cmake_parse_arguments(ARG + "${options}" + "${one_value_args}" + "${multi_value_args}" + ${ARGN}) + if(ARG_UNPARSED_ARGUMENTS) + message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") + endif() + + if(NOT PAIMON_BUILD_BENCHMARKS AND NOT ARG_ENABLED) + return() + endif() + + get_filename_component(BENCHMARK_NAME ${REL_BENCHMARK_NAME} NAME_WE) + + if(ARG_PREFIX) + set(BENCHMARK_NAME "${ARG_PREFIX}-${BENCHMARK_NAME}") + endif() + + if(ARG_SOURCES) + set(SOURCES ${ARG_SOURCES}) + else() + set(SOURCES "${REL_BENCHMARK_NAME}.cpp") + endif() + + string(REPLACE "_" "-" BENCHMARK_NAME ${BENCHMARK_NAME}) + set(BENCHMARK_PATH "${EXECUTABLE_OUTPUT_PATH}/${BENCHMARK_NAME}") + message(STATUS ${BENCHMARK_NAME}) + add_executable(${BENCHMARK_NAME} ${SOURCES}) + + if(ARG_STATIC_LINK_LIBS) + target_link_libraries(${BENCHMARK_NAME} PRIVATE ${ARG_STATIC_LINK_LIBS}) + endif() + + if(ARG_EXTRA_LINK_LIBS) + target_link_libraries(${BENCHMARK_NAME} PRIVATE ${ARG_EXTRA_LINK_LIBS}) + endif() + + if(ARG_EXTRA_INCLUDES) + target_include_directories(${BENCHMARK_NAME} SYSTEM PUBLIC ${ARG_EXTRA_INCLUDES}) + endif() + + 
if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + target_compile_options(${BENCHMARK_NAME} PRIVATE -Wno-global-constructors) + endif() + target_compile_options(${BENCHMARK_NAME} PRIVATE -fno-access-control) + + add_test(${BENCHMARK_NAME} + ${BUILD_SUPPORT_DIR}/run-test.sh + ${CMAKE_BINARY_DIR} + benchmark + ${BENCHMARK_PATH}) + + foreach(TARGET ${ARG_LABELS}) + add_dependencies(${TARGET} ${BENCHMARK_NAME}) + endforeach() + + set(LABELS) + list(APPEND LABELS "benchmark") + if(ARG_LABELS) + list(APPEND LABELS ${ARG_LABELS}) + endif() + + foreach(LABEL ${ARG_LABELS}) + set(LABEL_BENCHMARK_NAME "benchmark-${LABEL}") + if(NOT TARGET ${LABEL_BENCHMARK_NAME}) + add_custom_target(${LABEL_BENCHMARK_NAME} + ctest -L "${LABEL}" --output-on-failure + USES_TERMINAL) + endif() + add_dependencies(${LABEL_BENCHMARK_NAME} ${BENCHMARK_NAME}) + endforeach() + + set_property(TEST ${BENCHMARK_NAME} + APPEND + PROPERTY LABELS ${LABELS}) +endfunction() + +function(add_paimon_benchmark REL_BENCHMARK_NAME) + set(options) + set(one_value_args PREFIX) + set(multi_value_args LABELS) + cmake_parse_arguments(ARG + "${options}" + "${one_value_args}" + "${multi_value_args}" + ${ARGN}) + + if(ARG_PREFIX) + set(PREFIX ${ARG_PREFIX}) + else() + set(PREFIX "paimon") + endif() + + if(ARG_LABELS) + set(LABELS ${ARG_LABELS}) + else() + set(LABELS "paimon-benchmarks") + endif() + + add_benchmark_case(${REL_BENCHMARK_NAME} + PREFIX + ${PREFIX} + LABELS + ${LABELS} + ${ARG_UNPARSED_ARGUMENTS}) +endfunction() diff --git a/cmake_modules/DefineOptions.cmake b/cmake_modules/DefineOptions.cmake index 41ff252c5..cf7f964be 100644 --- a/cmake_modules/DefineOptions.cmake +++ b/cmake_modules/DefineOptions.cmake @@ -107,6 +107,9 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") define_option(PAIMON_BUILD_TESTS "Build the Paimon googletest unit tests" OFF) + define_option(PAIMON_BUILD_BENCHMARKS + "Build the Paimon Google Benchmark performance benchmarks" OFF) + if(PAIMON_BUILD_SHARED) 
set(PAIMON_TEST_LINKAGE_DEFAULT "shared")
   else()
@@ -241,6 +244,13 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
                        AUTO
                        BUNDLED
                        SYSTEM)
+
+  define_option_string(benchmark_SOURCE
+                       "Dependency source for Google Benchmark"
+                       ""
+                       AUTO
+                       BUNDLED
+                       SYSTEM)
 endif()

 macro(validate_config)
diff --git a/cmake_modules/FindbenchmarkAlt.cmake b/cmake_modules/FindbenchmarkAlt.cmake
new file mode 100644
index 000000000..a731f3359
--- /dev/null
+++ b/cmake_modules/FindbenchmarkAlt.cmake
@@ -0,0 +1,83 @@
+# Copyright 2026-present Alibaba Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Locate an installed Google Benchmark and expose it through imported targets.
+#
+# Search roots: benchmark_ROOT, BENCHMARK_ROOT and PAIMON_PACKAGE_PREFIX. When at
+# least one is set, only those roots are searched (NO_DEFAULT_PATH).
+#
+# Results:
+#   benchmarkAlt_FOUND         TRUE when Google Benchmark was located
+#   benchmark::benchmark       imported library target
+#   benchmark::benchmark_main  imported target, defined only when the
+#                              benchmark_main library was located
+
+# Unquoted expansion already drops unset or empty roots, so the list needs no
+# further clean-up before it is tested.
+set(_PAIMON_BENCHMARK_ROOTS ${benchmark_ROOT} ${BENCHMARK_ROOT} ${PAIMON_PACKAGE_PREFIX})
+set(_PAIMON_BENCHMARK_FIND_ARGS)
+if(_PAIMON_BENCHMARK_ROOTS)
+  set(_PAIMON_BENCHMARK_FIND_ARGS HINTS ${_PAIMON_BENCHMARK_ROOTS} NO_DEFAULT_PATH)
+endif()
+
+# Prefer the package's own CMake config; it already defines the targets.
+find_package(benchmark CONFIG QUIET ${_PAIMON_BENCHMARK_FIND_ARGS})
+
+if(TARGET benchmark::benchmark)
+  set(benchmarkAlt_FOUND TRUE)
+else()
+  # No usable config package: fall back to locating the header and libraries.
+  find_path(BENCHMARK_INCLUDE_DIR
+            NAMES benchmark/benchmark.h ${_PAIMON_BENCHMARK_FIND_ARGS}
+            PATH_SUFFIXES include)
+  find_library(BENCHMARK_LIBRARY
+               NAMES benchmark ${_PAIMON_BENCHMARK_FIND_ARGS}
+               PATH_SUFFIXES lib lib64)
+  find_library(BENCHMARK_MAIN_LIBRARY
+               NAMES benchmark_main ${_PAIMON_BENCHMARK_FIND_ARGS}
+               PATH_SUFFIXES lib lib64)
+
+  include(FindPackageHandleStandardArgs)
+  find_package_handle_standard_args(benchmarkAlt REQUIRED_VARS BENCHMARK_INCLUDE_DIR
+                                    BENCHMARK_LIBRARY)
+
+  if(benchmarkAlt_FOUND)
+    add_library(benchmark::benchmark UNKNOWN IMPORTED)
+    set_target_properties(benchmark::benchmark
+                          PROPERTIES IMPORTED_LOCATION "${BENCHMARK_LIBRARY}"
+                                     INTERFACE_INCLUDE_DIRECTORIES
+                                     "${BENCHMARK_INCLUDE_DIR}")
+    # The library uses threads; a raw library file carries no link interface, so
+    # pass the thread library on to consumers when one is available.
+    find_package(Threads QUIET)
+    if(TARGET Threads::Threads)
+      set_property(TARGET benchmark::benchmark
+                   APPEND
+                   PROPERTY INTERFACE_LINK_LIBRARIES Threads::Threads)
+    endif()
+
+    if(BENCHMARK_MAIN_LIBRARY AND NOT TARGET benchmark::benchmark_main)
+      add_library(benchmark::benchmark_main UNKNOWN IMPORTED)
+      set_target_properties(benchmark::benchmark_main
+                            PROPERTIES IMPORTED_LOCATION "${BENCHMARK_MAIN_LIBRARY}"
+                                       INTERFACE_INCLUDE_DIRECTORIES
+                                       "${BENCHMARK_INCLUDE_DIR}"
+                                       # benchmark_main only supplies main().
+                                       INTERFACE_LINK_LIBRARIES benchmark::benchmark)
+    endif()
+  endif()
+endif()
+
+unset(_PAIMON_BENCHMARK_ROOTS)
+unset(_PAIMON_BENCHMARK_FIND_ARGS)
diff --git a/cmake_modules/ThirdpartyToolchain.cmake b/cmake_modules/ThirdpartyToolchain.cmake
index 068cf7de7..271011a0d 100644
--- a/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cmake_modules/ThirdpartyToolchain.cmake
@@ -245,6 +245,18 @@ else()
     endif()
 endif()

+if(DEFINED ENV{PAIMON_BENCHMARK_URL})
+  set(BENCHMARK_SOURCE_URL "$ENV{PAIMON_BENCHMARK_URL}")
+else()
+  if(EXISTS "${THIRDPARTY_DIR}/${PAIMON_BENCHMARK_PKG_NAME}")
+    set_urls(BENCHMARK_SOURCE_URL "${THIRDPARTY_DIR}/${PAIMON_BENCHMARK_PKG_NAME}")
+  else()
+    set_urls(BENCHMARK_SOURCE_URL
+             "${THIRDPARTY_MIRROR_URL}https://github.com/google/benchmark/archive/refs/tags/v${PAIMON_BENCHMARK_BUILD_VERSION}.tar.gz"
+    )
+  endif()
+endif()
+
 if(DEFINED ENV{PAIMON_TBB_URL})
   set(TBB_SOURCE_URL "$ENV{PAIMON_TBB_URL}")
 else()
@@ -500,6 +512,8 @@ function(paimon_get_dependency_compat_target DEPENDENCY_NAME OUT_VAR)
     set(_target libprotobuf)
   elseif("${DEPENDENCY_NAME}" STREQUAL "GTest")
     set(_target GTest::gtest)
+  elseif("${DEPENDENCY_NAME}" STREQUAL "benchmark")
+    set(_target benchmark::benchmark)
   elseif("${DEPENDENCY_NAME}" STREQUAL "RE2")
     set(_target re2::re2)
   elseif("${DEPENDENCY_NAME}" STREQUAL "Snappy")
@@ -586,6 +600,8 @@ macro(paimon_build_dependency DEPENDENCY_NAME)
     build_avro()
   elseif("${DEPENDENCY_NAME}" STREQUAL "GTest")
     build_gtest()
+  elseif("${DEPENDENCY_NAME}" STREQUAL "benchmark")
+    build_benchmark()
   else()
     message(FATAL_ERROR "No bundled build rule for ${DEPENDENCY_NAME}")
   endif()
@@ -1743,6 +1759,67 @@ macro(build_tbb)

 endmacro(build_tbb)

+# Build Google Benchmark as a static ExternalProject and define the imported
+# targets benchmark::benchmark and benchmark::benchmark_main on top of it.
+macro(build_benchmark)
+  message(STATUS "Building benchmark from source")
+
+  set(BENCHMARK_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/benchmark_ep-install")
+  set(BENCHMARK_INCLUDE_DIR "${BENCHMARK_PREFIX}/include")
+  set(BENCHMARK_STATIC_LIB
+      "${BENCHMARK_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}benchmark${CMAKE_STATIC_LIBRARY_SUFFIX}"
+  )
+  set(BENCHMARK_MAIN_STATIC_LIB
+      "${BENCHMARK_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}benchmark_main${CMAKE_STATIC_LIBRARY_SUFFIX}"
+  )
+
+  # The paths above assume a static archive under <prefix>/lib, so pin both the
+  # library type and the install libdir (GNUInstallDirs may otherwise choose
+  # lib64). Upstream's -Werror default is disabled so that warnings from newer
+  # compilers do not fail the bundled build.
+  set(BENCHMARK_CMAKE_ARGS
+      ${EP_COMMON_CMAKE_ARGS}
+      "-DCMAKE_INSTALL_PREFIX=${BENCHMARK_PREFIX}"
+      -DCMAKE_INSTALL_LIBDIR=lib
+      -DBUILD_SHARED_LIBS=OFF
+      -DBENCHMARK_ENABLE_TESTING=OFF
+      -DBENCHMARK_ENABLE_GTEST_TESTS=OFF
+      -DBENCHMARK_ENABLE_WERROR=OFF
+      -DBENCHMARK_DOWNLOAD_DEPENDENCIES=OFF)
+
+  externalproject_add(benchmark_ep
+                      URL ${BENCHMARK_SOURCE_URL}
+                      URL_HASH "SHA256=${PAIMON_BENCHMARK_BUILD_SHA256_CHECKSUM}"
+                      CMAKE_ARGS ${BENCHMARK_CMAKE_ARGS}
+                      BUILD_BYPRODUCTS "${BENCHMARK_STATIC_LIB}"
+                                       "${BENCHMARK_MAIN_STATIC_LIB}")
+
+  # Imported targets require their include directory to exist at configure
+  # time, before the external project has installed anything.
+  file(MAKE_DIRECTORY "${BENCHMARK_INCLUDE_DIR}")
+
+  # A static libbenchmark needs the thread library on the consumer's link line.
+  find_package(Threads REQUIRED)
+
+  add_library(benchmark::benchmark STATIC IMPORTED)
+  set_target_properties(benchmark::benchmark
+                        PROPERTIES IMPORTED_LOCATION "${BENCHMARK_STATIC_LIB}"
+                                   INTERFACE_INCLUDE_DIRECTORIES
+                                   "${BENCHMARK_INCLUDE_DIR}"
+                                   INTERFACE_COMPILE_DEFINITIONS
+                                   BENCHMARK_STATIC_DEFINE
+                                   INTERFACE_LINK_LIBRARIES Threads::Threads)
+  add_dependencies(benchmark::benchmark benchmark_ep)
+
+  add_library(benchmark::benchmark_main STATIC IMPORTED)
+  set_target_properties(benchmark::benchmark_main
+                        PROPERTIES IMPORTED_LOCATION "${BENCHMARK_MAIN_STATIC_LIB}"
+                                   INTERFACE_INCLUDE_DIRECTORIES
+                                   "${BENCHMARK_INCLUDE_DIR}"
+                                   INTERFACE_LINK_LIBRARIES benchmark::benchmark)
+  add_dependencies(benchmark::benchmark_main benchmark_ep)
+endmacro()
+
 macro(build_glog)
   message(STATUS "Building glog from source")
   set(GLOG_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/glog_ep-install")
@@ -1810,6 +1887,9 @@ if(PAIMON_ENABLE_ORC)
   resolve_dependency(Protobuf)
   resolve_dependency(ORC)
 endif()
+if(PAIMON_BUILD_BENCHMARKS)
+  resolve_dependency(benchmark)
+endif()
 if(PAIMON_ENABLE_JINDO)
   build_jindosdk_c()
   build_jindosdk_nextarch()
diff --git a/docs/source/examples/benchmark.rst b/docs/source/examples/benchmark.rst
new file mode 100644
index 000000000..7ae4e272f
--- /dev/null
+++ b/docs/source/examples/benchmark.rst
@@ -0,0 +1,92 @@
+.. Copyright 2026-present Alibaba Inc.
+
+.. Licensed under the Apache License, Version 2.0 (the "License");
+.. you may not use this file except in compliance with the License.
+.. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing, software
+.. distributed under the License is distributed on an "AS IS" BASIS,
+.. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. See the License for the specific language governing permissions and
+.. limitations under the License.
+
+================
+Benchmark Usage
+================
+
+Paimon C++ provides Google Benchmark-based cases for append-table write/read and
+primary-key table write and merge-on-read (MOR) read paths. Benchmarks are disabled by default.
+
+Build
+=====
+
+Enable benchmarks when configuring CMake::
+
+    cmake -S . -B build -DPAIMON_BUILD_BENCHMARKS=ON
+    cmake --build build --target paimon-read-write-benchmark
+
+Run all benchmark cases through CTest::
+
+    cmake --build build --target benchmark
+
+Custom Options
+==============
+
+``paimon-read-write-benchmark`` accepts Google Benchmark options plus the
+Paimon-specific options below:
+
+``--paimon_source_data_file=<path>``
+    Source data file used to build benchmark data. Currently, Parquet source files
+    are supported.
+
+``--paimon_external_table_path=<path>``
+    Read directly from an existing table path for ``BM_Read`` and ``BM_MOR_Read``.
+    When set, the source loading and pre-write stages are skipped.
+
+``--paimon_file_format=<format>``
+    Target table file format. The default value is ``parquet``.
+
+``--paimon_source_batch_max_rows=<n>``
+    Maximum number of rows per source batch. The default value is ``4096``.
+
+``--paimon_row_to_batch_thread_number=<n>``
+    Number of row-to-batch threads used for reads. The default value is ``3``.
+
+``--paimon_pk_columns=<columns>``
+    Primary key columns for ``BM_PK_Write`` and ``BM_MOR_Read``. These cases
+    explicitly use ``bucket=1`` because benchmark batches are written to bucket 0.
+
+``--paimon_option=<key>:<value>;<key>:<value>``
+    Repeatable table options passed through to Paimon. For ``BM_PK_Write`` and
+    ``BM_MOR_Read``, ``bucket`` is forced to ``1``.
+
+Examples
+========
+
+Append table write::
+
+    paimon-read-write-benchmark \
+        --paimon_source_data_file=/path/data.parquet \
+        --benchmark_filter=BM_Write
+
+Append table read with four prefetch workers::
+
+    paimon-read-write-benchmark \
+        --paimon_source_data_file=/path/data.parquet \
+        --benchmark_filter=BM_Read/4
+
+Primary-key table write::
+
+    paimon-read-write-benchmark \
+        --paimon_source_data_file=/path/data.parquet \
+        --paimon_pk_columns=id \
+        --benchmark_filter=BM_PK_Write
+
+MOR read from an existing table::
+
+    paimon-read-write-benchmark \
+        --paimon_external_table_path=/path/table \
+        --paimon_pk_columns=id \
+        --benchmark_filter=BM_MOR_Read/4
diff --git a/docs/source/examples/index.rst b/docs/source/examples/index.rst
index 0ba3b318a..b3ec8c536 100644
--- a/docs/source/examples/index.rst
+++ b/docs/source/examples/index.rst
@@ -20,3 +20,4 @@ Examples

    write_commit_scan_read
    clean
+   benchmark
diff --git a/src/paimon/testing/utils/CMakeLists.txt b/src/paimon/testing/utils/CMakeLists.txt
index b8fe36296..ee18db9ea 100644
--- a/src/paimon/testing/utils/CMakeLists.txt
+++ b/src/paimon/testing/utils/CMakeLists.txt
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-if(PAIMON_BUILD_TESTS) +if(PAIMON_BUILD_TESTS OR PAIMON_BUILD_BENCHMARKS) set(PAIMON_TEST_UTILS testharness.cpp data_generator.cpp) @@ -25,6 +25,9 @@ if(PAIMON_BUILD_TESTS) STATIC_LINK_LIBS paimon_static ${GTEST_LINK_TOOLCHAIN}) +endif() + +if(PAIMON_BUILD_TESTS) add_paimon_test(test_utils_test SOURCES diff --git a/third_party/versions.txt b/third_party/versions.txt index c27599997..89624f5c0 100644 --- a/third_party/versions.txt +++ b/third_party/versions.txt @@ -60,6 +60,10 @@ PAIMON_GTEST_BUILD_VERSION=1.11.0 PAIMON_GTEST_BUILD_SHA256_CHECKSUM=b4870bf121ff7795ba20d20bcdd8627b8e088f2d1dab299a031c1034eddc93d5 PAIMON_GTEST_PKG_NAME=gtest-${PAIMON_GTEST_BUILD_VERSION}.tar.gz +PAIMON_BENCHMARK_BUILD_VERSION=1.9.1 +PAIMON_BENCHMARK_BUILD_SHA256_CHECKSUM=32131c08ee31eeff2c8968d7e874f3cb648034377dfc32a4c377fa8796d84981 +PAIMON_BENCHMARK_PKG_NAME=benchmark-${PAIMON_BENCHMARK_BUILD_VERSION}.tar.gz + PAIMON_ARROW_BUILD_VERSION=17.0.0 PAIMON_ARROW_BUILD_SHA256_CHECKSUM=9d280d8042e7cf526f8c28d170d93bfab65e50f94569f6a790982a878d8d898d PAIMON_ARROW_PKG_NAME=apache-arrow-${PAIMON_ARROW_BUILD_VERSION}.tar.gz @@ -124,6 +128,7 @@ DEPENDENCIES=( "PAIMON_TBB_URL ${PAIMON_TBB_PKG_NAME} ${THIRDPARTY_MIRROR_URL}https://github.com/uxlfoundation/oneTBB/archive/refs/tags/${PAIMON_TBB_BUILD_VERSION}.tar.gz" "PAIMON_ORC_URL ${PAIMON_ORC_PKG_NAME} ${THIRDPARTY_MIRROR_URL}https://github.com/apache/orc/archive/refs/tags/${PAIMON_ORC_BUILD_VERSION}.tar.gz" "PAIMON_GTEST_URL ${PAIMON_GTEST_PKG_NAME} ${THIRDPARTY_MIRROR_URL}https://github.com/google/googletest/archive/release-${PAIMON_GTEST_BUILD_VERSION}.tar.gz" + "PAIMON_BENCHMARK_URL ${PAIMON_BENCHMARK_PKG_NAME} ${THIRDPARTY_MIRROR_URL}https://github.com/google/benchmark/archive/refs/tags/v${PAIMON_BENCHMARK_BUILD_VERSION}.tar.gz" "PAIMON_ARROW_URL ${PAIMON_ARROW_PKG_NAME} ${THIRDPARTY_MIRROR_URL}https://github.com/apache/arrow/releases/download/apache-arrow-${PAIMON_ARROW_BUILD_VERSION}/apache-arrow-${PAIMON_ARROW_BUILD_VERSION}.tar.gz" 
"PAIMON_AVRO_URL ${PAIMON_AVRO_PKG_NAME} ${THIRDPARTY_MIRROR_URL}https://github.com/apache/avro/archive/${PAIMON_AVRO_BUILD_VERSION}.tar.gz" "PAIMON_FMT_URL ${PAIMON_FMT_PKG_NAME} ${THIRDPARTY_MIRROR_URL}https://github.com/fmtlib/fmt/archive/refs/tags/${PAIMON_FMT_BUILD_VERSION}.tar.gz"