diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ce9e44..00c24f4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,6 +96,7 @@ message(STATUS " WITH_LOGGING = ${WITH_LOGGING}") message(STATUS " SAMPLING_TEST_MODE = ${SAMPLING_TEST_MODE}") message(STATUS " ENABLE_FOR_SCHEME = ${ENABLE_FOR_SCHEME}") message(STATUS " BUILD_SHARED_LIBRARY = ${BUILD_SHARED_LIBRARY}") +message(STATUS " BTRBLOCKS_CHUNKSIZE = ${BTRBLOCKS_CHUNKSIZE}") message(STATUS "[test] settings") message(STATUS " GTEST_INCLUDE_DIR = ${GTEST_INCLUDE_DIR}") message(STATUS " GTEST_LIBRARY_PATH = ${GTEST_LIBRARY_PATH}") diff --git a/btrblocks/btrblocks.hpp b/btrblocks/btrblocks.hpp index d3c6ae3..4a4a169 100644 --- a/btrblocks/btrblocks.hpp +++ b/btrblocks/btrblocks.hpp @@ -13,6 +13,11 @@ #include "scheme/SchemeConfig.hpp" #include "scheme/SchemeType.hpp" // ------------------------------------------------------------------------------ +#ifndef CHUNKSIZE + #define CHUNKSIZE 16 // Define the CHUNKSIZE if it is not defined by an compile option + #warning "No chunk size was set, defaulting to CHUNKSIZE=16" +#endif +// ------------------------------------------------------------------------------ namespace btrblocks { // ------------------------------------------------------------------------------ // Global configuation used by the compression interface @@ -21,7 +26,7 @@ enum class SchemeSelection : uint8_t { SAMPLE, TRY_ALL }; // ------------------------------------------------------------------------------ struct BtrBlocksConfig { // clang-format off - size_t block_size{65536}; // max tuples in a single block + size_t block_size{1 << CHUNKSIZE}; // max tuples in a single block uint32_t sample_size{64}; // run size of each sample uint32_t sample_count{10}; // number of samples to take diff --git a/btrblocks/compression/BtrReader.cpp b/btrblocks/compression/BtrReader.cpp index a9c1cb7..34d3bc3 100644 --- a/btrblocks/compression/BtrReader.cpp +++ b/btrblocks/compression/BtrReader.cpp @@ -197,7 +197,7 @@ u32 BtrReader::getDecompressedDataSize(u32 index) { auto input_data = static_cast(meta->data); BitmapWrapper* bitmapWrapper = this->getBitmap(index); - u32 size = scheme.getTotalLength(input_data, meta->tuple_count, bitmapWrapper); + u32 size = scheme.getDecompressedSize(input_data, meta->tuple_count, bitmapWrapper); return size; } default: { diff --git a/btrblocks/extern/FastPFOR.cpp b/btrblocks/extern/FastPFOR.cpp index 9129b4d..2235631 100644 --- a/btrblocks/extern/FastPFOR.cpp +++ b/btrblocks/extern/FastPFOR.cpp @@ -8,9 +8,10 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wuninitialized" #include -#include +#include #include -#include +#include +#include #pragma GCC diagnostic pop // ------------------------------------------------------------------------------------- using namespace btrblocks; diff --git a/btrblocks/local.cmake b/btrblocks/local.cmake index 518d819..a3e4f29 100755 --- a/btrblocks/local.cmake +++ b/btrblocks/local.cmake @@ -82,6 +82,8 @@ target_include_directories(btrblocks PUBLIC ${BTR_PUBLIC_INCLUDE_DIR} PRIVATE ${BTR_PRIVATE_INCLUDE_DIR}) +target_compile_options(btrblocks PRIVATE "-DCHUNKSIZE=${BTRBLOCKS_CHUNKSIZE}") + # set_target_properties(btrblocks PROPERTIES PUBLIC_HEADER "${BTR_HH}") # --------------------------------------------------------------------------- diff --git a/cmake/environment.cmake b/cmake/environment.cmake index 717740f..895542b 100644 --- a/cmake/environment.cmake +++ b/cmake/environment.cmake @@ -1,6 +1,10 @@ # --------------------------------------------------------------------------- # Environment-specific settings # --------------------------------------------------------------------------- +if(NOT DEFINED BTRBLOCKS_CHUNKSIZE) + # Set BTRBLOCKS_CHUNKSIZE only if it's not already defined + set(BTRBLOCKS_CHUNKSIZE 16) +endif() if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" AND CMAKE_BUILD_TYPE MATCHES Debug) add_compile_options(-fstandalone-debug) diff --git a/cmake/fastpfor.cmake b/cmake/fastpfor.cmake index 9200e43..9d30c8f 100755 --- a/cmake/fastpfor.cmake +++ b/cmake/fastpfor.cmake @@ -9,7 +9,7 @@ ExternalProject_Add( fastpfor_src PREFIX "vendor/lemire/fastpfor" GIT_REPOSITORY "https://github.com/lemire/FastPFor.git" - GIT_TAG 773283d4a11fa2440a1b3b28fd77f775e86d7898 + GIT_TAG master TIMEOUT 10 UPDATE_COMMAND "" # to prevent rebuilding everytime INSTALL_COMMAND "" diff --git a/cmake/fsst.cmake b/cmake/fsst.cmake index 559c63e..c2cd051 100644 --- a/cmake/fsst.cmake +++ b/cmake/fsst.cmake @@ -17,7 +17,7 @@ ExternalProject_Add( fsst_src PREFIX "vendor/cwida/fsst" GIT_REPOSITORY "https://github.com/cwida/fsst.git" - GIT_TAG 0f0f9057048412da1ee48e35d516155cb7edd155 + GIT_TAG aaa3a0cedfe191607f6884a6781cc2d0e97d203c TIMEOUT 10 UPDATE_COMMAND "" # to prevent rebuilding everytime INSTALL_COMMAND "" diff --git a/cmake/tbb.cmake b/cmake/tbb.cmake index b9d4a87..73b6d5a 100755 --- a/cmake/tbb.cmake +++ b/cmake/tbb.cmake @@ -9,18 +9,17 @@ find_package(Git REQUIRED) ExternalProject_Add( tbb_src PREFIX "vendor/intel/tbb" - GIT_REPOSITORY "https://github.com/wjakob/tbb.git" - GIT_TAG b066defc0229a1e92d7a200eb3fe0f7e35945d95 + GIT_REPOSITORY "https://github.com/seb711/oneTBB.git" + GIT_TAG master TIMEOUT 10 BUILD_COMMAND make UPDATE_COMMAND "" # to prevent rebuilding everytime INSTALL_COMMAND "" CMAKE_ARGS - -DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR}/vendor/tbb_cpp -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE} + -DCMAKE_LIBRARY_OUTPUT_DIRECTORY:STRING=./lib + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DTBB_TEST:BOOL=OFF ) # Prepare json diff --git a/tools/conversion/decompression-speed.cpp b/tools/conversion/decompression-speed.cpp index 87ea0f8..c4c3c64 100644 --- a/tools/conversion/decompression-speed.cpp +++ b/tools/conversion/decompression-speed.cpp @@ -3,26 +3,34 @@ #include #include #include +#include // ------------------------------------------------------------------------------------- #include "gflags/gflags.h" +#define TBB_PREVIEW_GLOBAL_CONTROL 1 +#include "tbb/global_control.h" #include "tbb/parallel_for.h" -#include "tbb/task_scheduler_init.h" +#include // ------------------------------------------------------------------------------------- +#include "btrfiles.hpp" #include "common/PerfEvent.hpp" #include "common/Utils.hpp" #include "compression/BtrReader.hpp" #include "scheme/SchemePool.hpp" // ------------------------------------------------------------------------------------- DEFINE_string(btr, "btr", "Directory with btr input"); -DEFINE_int32(threads, 1, "Number of threads used. not specifying lets tbb decide"); +DEFINE_int32(threads, -1, "Number of threads used. not specifying lets tbb decide"); DEFINE_int32(column, -1, "Select a specific column to measure"); DEFINE_string(typefilter, "", "Only measure columns with given type"); //DEFINE_int32(chunk, -1, "Select a specific chunk to measure"); DEFINE_uint32(reps, 1, "Loop reps times"); DEFINE_bool(perfevent, false, "Profile with perf event if true"); -DEFINE_bool(output_summary, false, "Output a summary of total speed and size"); -DEFINE_bool(output_columns, true, "Output speeds and sizes for single columns"); +DEFINE_bool(output_summary, true, "Output a summary of total speed and size"); +DEFINE_bool(output_columns, false, "Output speeds and sizes for single columns"); DEFINE_bool(print_simd_debug, false, "Print SIMD usage debug information"); +// if verification is needed following is obligatory +DEFINE_bool(verify, false, "Verify that decompression works"); +DEFINE_string(yaml, "schema.yaml", "Schema in YAML format"); +DEFINE_string(binary, "binary", "Directory for binary output"); // ------------------------------------------------------------------------------------- using namespace btrblocks; // ------------------------------------------------------------------------------------- @@ -37,6 +45,53 @@ void reset_bitmaps(const FileMetadata *metadata, std::vector bitmap; + std::vector data; + u64 tuple_count; + bool requiresCopy; + Range range; + size_t decompressedSize; +}; +// ------------------------------------------------------------------------------------- +u64 measure_and_store(const FileMetadata *metadata, std::vector> &readers, std::vector &runtimes, std::vector &columns, std::vector>> &decompressed_data) { + // Make sure no bitmap is cached + reset_bitmaps(metadata, readers, columns); + + auto total_start_time = std::chrono::steady_clock::now(); + + tbb::parallel_for_each(columns, [&](u32 column_i) { + decompressed_data[column_i].resize(metadata->parts[column_i].num_parts); + // TODO not sure if measuring the time like that will cause problems + auto start_time = std::chrono::steady_clock::now(); + tbb::parallel_for(u32(0), metadata->parts[column_i].num_parts, [&](u32 part_i) { + auto &reader = readers[column_i][part_i]; + decompressed_data[column_i][part_i].resize(reader.getChunkCount()); + + tbb::parallel_for(u32(0), reader.getChunkCount(), [&](u32 chunk_i) { + thread_local std::vector dataDest; + + decompressed_data[column_i][part_i][chunk_i].requiresCopy = + reader.readColumn(dataDest, chunk_i); + decompressed_data[column_i][part_i][chunk_i].data = std::move(dataDest); + decompressed_data[column_i][part_i][chunk_i].bitmap = std::move(reader.getBitmap(chunk_i)->writeBITMAP()); + decompressed_data[column_i][part_i][chunk_i].tuple_count = reader.getChunkMetadata(chunk_i)->tuple_count; + decompressed_data[column_i][part_i][chunk_i].range = tuple( + reader.getPartMetadata()->offsets[chunk_i], + reader.getTupleCount(chunk_i)); + decompressed_data[column_i][part_i][chunk_i].decompressedSize = reader.getDecompressedDataSize(chunk_i); + }); + }); + auto end_time = std::chrono::steady_clock::now(); + auto runtime = std::chrono::duration_cast(end_time - start_time); + runtimes[column_i] += runtime.count(); + }); + + auto total_end_time = std::chrono::steady_clock::now(); + auto total_runtime = std::chrono::duration_cast(total_end_time - total_start_time); + return total_runtime.count(); +} +// ------------------------------------------------------------------------------------- u64 measure(const FileMetadata *metadata, std::vector> &readers, std::vector &runtimes, std::vector &columns) { // Make sure no bitmap is cached reset_bitmaps(metadata, readers, columns); @@ -103,12 +158,13 @@ int main(int argc, char **argv) { int threads; if (FLAGS_threads < 1) { - // Automatic selection - threads = -1; + // Automatic selection + tbb::global_control c(tbb::global_control::max_allowed_parallelism, + std::thread::hardware_concurrency()); } else { - threads = FLAGS_threads; + threads = FLAGS_threads; + tbb::global_control c(tbb::global_control::max_allowed_parallelism, threads); } - tbb::task_scheduler_init init(threads); // Read the metadata std::vector raw_file_metadata; @@ -202,6 +258,35 @@ int main(int argc, char **argv) { total_compressed_size += compressed_sizes[column_i]; } + + // Verify + size_t total_size_verify = 0; + if (FLAGS_verify) { + std::vector>> decompressed_data(file_metadata->num_columns); + + measure_and_store(file_metadata, readers, runtimes, columns, decompressed_data); + + const auto schema = YAML::LoadFile(FLAGS_yaml); + Relation relation = files::readDirectory(schema, FLAGS_binary.back() == '/' ? FLAGS_binary : FLAGS_binary + "/"); + auto ranges = relation.getRanges(SplitStrategy::SEQUENTIAL, -1); + + for (u32 column_i : columns) { + u32 global_chunk_counter = 0; + for (u32 part_i = 0; part_i < file_metadata->parts[column_i].num_parts; part_i++) { + for (u32 chunk_i = 0; chunk_i < decompressed_data[column_i][part_i].size(); chunk_i++, global_chunk_counter++) { + DecompressedChunkData& decompressedChunk = decompressed_data[column_i][part_i][chunk_i]; + auto input_chunk = relation.getInputChunk(ranges[global_chunk_counter], global_chunk_counter, column_i); + if (!input_chunk.compareContents(decompressedChunk.data.data(), decompressedChunk.bitmap, + decompressedChunk.tuple_count, + decompressedChunk.requiresCopy)) { + throw Generic_Exception("Decompression yields different contents"); + } + total_size_verify += input_chunk.size; + } + } + } + } + if (FLAGS_output_columns) { for (u32 column_i : columns) { double average_runtime = static_cast(runtimes[column_i]) / static_cast(FLAGS_reps); @@ -229,12 +314,7 @@ int main(int argc, char **argv) { double s = average_runtime / (1000.0 * 1000.0); double mbs = mb / s; - std::cout << "Total:" - << " " << total_compressed_size << " Bytes" - << " " << total_size << " Bytes" - << " " << average_runtime << " us" - << " " << mbs << " MB/s" - << std::endl; + std::cout << std::to_string(average_runtime) << ", " << total_compressed_size << ", " << total_size << ", " << total_size_verify << ", " << std::to_string((double)total_size / (double)total_compressed_size) << '\n'; } } // ------------------------------------------------------------------------------------- diff --git a/tools/conversion/local.cmake b/tools/conversion/local.cmake index 697fd67..15a0def 100644 --- a/tools/conversion/local.cmake +++ b/tools/conversion/local.cmake @@ -22,7 +22,7 @@ target_link_libraries(btrmeta btrblocks gflags) add_executable(decompression-speed ${BTR_CONVERSION_DIR}/decompression-speed.cpp) target_include_directories(decompression-speed PRIVATE ${BTR_INCLUDE_DIR}) -target_link_libraries(decompression-speed btrblocks tbb gflags) +target_link_libraries(decompression-speed btrfiles btrblocks tbb gflags) add_executable(decompression-speed-s3 ${BTR_CONVERSION_DIR}/decompression-speed-s3.cpp) target_include_directories(decompression-speed-s3 PRIVATE ${BTR_INCLUDE_DIR})