Skip to content

Commit ab85985

Browse files
ketorastor-oss
authored and committed
[refactor][store] Optimize region metrics collection for region_size.
Skip region size calculation when the region has not been modified much. This will remarkably reduce the performance penalty of region size collection. Signed-off-by: Ketor <[email protected]>
1 parent fb833d1 commit ab85985

12 files changed

+256
-76
lines changed

conf/index.template.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ server:
77
port: $SERVER_PORT$
88
heartbeat_interval_s: 6
99
metrics_collect_interval_s: 300
10+
approximate_size_metrics_collect_interval_s: 300
1011
scrub_vector_index_interval_s: 60
1112
worker_thread_num: 128 # must >4, worker_thread_num priority worker_thread_ratio
1213
# worker_thread_ratio: 2 # cpu core * ratio
@@ -18,6 +19,7 @@ server:
1819
region:
1920
region_max_size: 536870912 # 512MB
2021
enable_auto_split: true
22+
split_check_interval_s: 120
2123
raft:
2224
host: $RAFT_HOST$
2325
port: $RAFT_PORT$

conf/store.template.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ server:
77
port: $SERVER_PORT$
88
heartbeat_interval_s: 6
99
metrics_collect_interval_s: 300
10+
approximate_size_metrics_collect_interval_s: 300
1011
worker_thread_num: 128 # must >4, worker_thread_num priority worker_thread_ratio
1112
# worker_thread_ratio: 1 # cpu core * ratio
1213
read_worker_num: 48 # read_worker_num + write_worker_num + raft_apply_worker_num must < server.worker_thread_num
@@ -17,6 +18,7 @@ server:
1718
region:
1819
region_max_size: 268435456 # 256MB
1920
enable_auto_split: true
21+
split_check_interval_s: 120
2022
raft:
2123
host: $RAFT_HOST$
2224
port: $RAFT_PORT$

proto/common.proto

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -694,10 +694,13 @@ message RegionMetrics {
694694
RegionDefinition region_definition = 5; // region definition
695695
VectorIndexStatus vector_index_status = 6; // vector index status
696696

697-
int64 row_count = 11; // row count of this region
698-
bytes min_key = 12; // the min key of this region now exist
699-
bytes max_key = 13; // the max key of this region now exist
700-
int64 region_size = 14; // the bytes size of this region
697+
int64 row_count = 11; // row count of this region
698+
bytes min_key = 12; // the min key of this region now exist
699+
bytes max_key = 13; // the max key of this region now exist
700+
int64 region_size = 14; // the bytes size of this region
701+
int64 last_update_metrics_log_index = 15; // last update metrics log index
702+
int64 last_update_metrics_version = 16; // last update metrics version
703+
int64 last_update_metrics_timestamp = 17; // last update metrics timestamp
701704

702705
// bool is_hold_vector_index = 29; // is hold vector index
703706
VectorIndexMetrics vector_index_metrics = 20; // vector index metrics

src/common/constant.h

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -189,26 +189,6 @@ class Constant {
189189
// hnsw max elements expand number
190190
static const uint32_t kHnswMaxElementsExpandNum = 10000;
191191

192-
// crontab default interval
193-
static const int32_t kHeartbeatIntervalS = 10;
194-
static const int32_t kScanIntervalS = 30;
195-
static const int32_t kPushIntervalS = 1;
196-
static const int32_t kUpdateStateIntervalS = 10;
197-
static const int32_t kTaskListIntervalS = 1;
198-
static const int32_t kCalcMetricsIntervalS = 60;
199-
static const int32_t kRecycleOrphanIntervalS = 60;
200-
static const int32_t kRemoveWatchIntervalS = 60;
201-
static const int32_t kLeaseIntervalS = 60;
202-
static const int32_t kCompactionIntervalS = 300;
203-
static const int32_t kScrubVectorIndexIntervalS = 60;
204-
static const int32_t kApproximateSizeMetricsCollectIntervalS = 50;
205-
static const int32_t kStoreMetricsCollectIntervalS = 30;
206-
static const int32_t kRegionMetricsCollectIntervalS = 300;
207-
static const int32_t kDefaultSplitCheckIntervalS = 120;
208-
static const int32_t kRaftSnapshotIntervalS = 120;
209-
static constexpr int32_t kGcUpdateSafePointIntervalS = 60;
210-
static constexpr int32_t kGcDoGcPointIntervalS = 60;
211-
212192
// raft snapshot
213193
inline static const std::string kRaftSnapshotRegionMetaFileName = "region_meta";
214194
inline static const std::string kRaftSnapshotRegionDateFileNameSuffix = ".dingo_sst";

src/crontab/crontab.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@
1515
#include "crontab/crontab.h"
1616

1717
#include "bthread/bthread.h"
18+
#include "bthread/unstable.h"
1819
#include "common/logging.h"
1920
#include "common/role.h"
2021
#include "fmt/core.h"
21-
#include "server/server.h"
2222

2323
namespace dingodb {
2424

src/crontab/crontab.h

Lines changed: 9 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,10 @@
2020
#include <functional>
2121
#include <map>
2222
#include <memory>
23-
#include <shared_mutex>
2423
#include <string>
2524
#include <vector>
2625

27-
#include "bthread/unstable.h"
26+
#include "bthread/types.h"
2827
#include "proto/common.pb.h"
2928

3029
namespace dingodb {
@@ -39,35 +38,24 @@ struct CrontabConfig {
3938

4039
class Crontab {
4140
public:
42-
Crontab()
43-
: id(0),
44-
interval(0),
45-
max_times(0),
46-
immediately(false),
47-
run_count(0),
48-
pause(false),
49-
timer_id(0),
50-
func(nullptr),
51-
arg(nullptr) {}
52-
53-
uint32_t id;
41+
uint32_t id{0};
5442
std::string name;
5543
// unit ms
56-
int64_t interval;
44+
int64_t interval{0};
5745
// 0 is no limit
58-
uint32_t max_times;
46+
uint32_t max_times{0};
5947
// Is immediately run
60-
bool immediately;
48+
bool immediately{false};
6149
// Already run count
62-
int run_count;
50+
int run_count{0};
6351
// Is pause crontab
64-
bool pause;
52+
bool pause{false};
6553
// bthread_timer_t handler
66-
bthread_timer_t timer_id;
54+
bthread_timer_t timer_id{0};
6755
// For run target function
6856
std::function<void(void*)> func;
6957
// Delivery to func_'s argument
70-
void* arg;
58+
void* arg{nullptr};
7159
};
7260

7361
// Manage crontab use brpc::bthread_timer_add

src/engine/bdb_raw_engine.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
#include <unordered_map>
2525
#include <vector>
2626

27-
#include "bthread/types.h"
2827
#include "config/config.h"
2928
#include "db_cxx.h"
3029
#include "engine/iterator.h"

src/handler/raft_apply_handler.cc

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -624,8 +624,9 @@ int SplitHandler::Handle(std::shared_ptr<Context>, store::RegionPtr from_region,
624624
}
625625

626626
// Update region metrics min/max key policy
627+
// Update region_size in next collect region metrics
627628
if (region_metrics != nullptr) {
628-
region_metrics->UpdateMaxAndMinKeyPolicy();
629+
region_metrics->ResetMetricsForRegionVersionUpdate();
629630
}
630631

631632
return 0;
@@ -787,7 +788,8 @@ int PrepareMergeHandler::Handle(std::shared_ptr<Context>, store::RegionPtr sourc
787788
}
788789

789790
int CommitMergeHandler::Handle(std::shared_ptr<Context>, store::RegionPtr target_region, std::shared_ptr<RawEngine>,
790-
const pb::raft::Request &req, store::RegionMetricsPtr, int64_t, int64_t /*log_id*/) {
791+
const pb::raft::Request &req, store::RegionMetricsPtr region_metrics, int64_t,
792+
int64_t /*log_id*/) {
791793
assert(target_region != nullptr);
792794
const auto &request = req.commit_merge();
793795
auto store_region_meta = GET_STORE_REGION_META;
@@ -934,6 +936,12 @@ int CommitMergeHandler::Handle(std::shared_ptr<Context>, store::RegionPtr target
934936
request.entries().size(), actual_apply_log_count, target_region->EpochToString(), target_region->RangeToString(),
935937
Helper::TimestampMs() - start_time);
936938

939+
// Update region metrics min/max key policy
940+
// Update region_size in next collect region metrics
941+
if (region_metrics != nullptr) {
942+
region_metrics->ResetMetricsForRegionVersionUpdate();
943+
}
944+
937945
return 0;
938946
}
939947

src/metrics/store_metrics_manager.cc

Lines changed: 58 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,11 @@
2424
#include "common/constant.h"
2525
#include "common/helper.h"
2626
#include "common/logging.h"
27-
#include "common/role.h"
2827
#include "config/config_manager.h"
2928
#include "fmt/core.h"
29+
#include "gflags/gflags.h"
3030
#include "proto/common.pb.h"
3131
#include "server/server.h"
32-
#include "vector/vector_index_manager.h"
3332

3433
namespace dingodb {
3534

@@ -312,20 +311,67 @@ std::vector<std::vector<store::RegionPtr>> GenBatchRegion(std::vector<store::Reg
312311
return result;
313312
}
314313

314+
DEFINE_int64(collect_approximate_size_log_index_interval, 10,
315+
"Collecting approximate size log index interval, the region has raft log index greater than this value "
316+
"will be collected region size");
317+
315318
bool StoreRegionMetrics::CollectApproximateSizeMetrics() {
316319
auto store_region_meta = GET_STORE_REGION_META;
317320
auto region_metricses = GetAllMetrics();
318321

319322
std::vector<store::RegionPtr> need_collect_rocks_regions;
320323
std::vector<store::RegionPtr> need_collect_bdb_regions;
324+
321325
for (const auto& region_metrics : region_metricses) {
326+
DINGO_LOG(DEBUG) << fmt::format(
327+
"[metrics.region] collect approximate size metrics start, region({}) log_index_id({})", region_metrics->Id(),
328+
region_metrics->LastLogIndex());
329+
322330
auto region = store_region_meta->GetRegion(region_metrics->Id());
323331
if (region == nullptr) {
332+
DINGO_LOG(INFO) << fmt::format("[metrics.region] skip collect approximate size metrics, region({}) not exist",
333+
region_metrics->Id());
324334
continue;
325335
}
336+
326337
if (region->State() != pb::common::NORMAL) {
338+
DINGO_LOG(INFO) << fmt::format(
339+
"[metrics.region] skip collect approximate size metrics for state not NORMAL, region({}) state({})",
340+
region->Id(), pb::common::StoreRegionState_Name(region->State()));
327341
continue;
328342
}
343+
344+
// first update region_version in region metrics
345+
auto region_version = region->Epoch().version();
346+
if (region_version > region_metrics->RegionVersion()) {
347+
region_metrics->SetRegionVersion(region_version);
348+
}
349+
350+
// get region log index id
351+
auto log_index_id = region_metrics->LastLogIndex();
352+
353+
// get last update metrics log index and version
354+
auto last_update_metrics_log_index = region_metrics->LastUpdateMetricsLogIndex();
355+
auto last_update_metrics_version = region_metrics->LastUpdateMetricsVersion();
356+
357+
// if the region version has been updated, update the last update metrics version, this is done in SetRegionSize
358+
// else if the region version is equal and the difference between the current log index and the last update
359+
// metrics log index is less than the interval, skip it
360+
if (region_version <= last_update_metrics_version &&
361+
(log_index_id - last_update_metrics_log_index < FLAGS_collect_approximate_size_log_index_interval)) {
362+
DINGO_LOG(INFO) << fmt::format(
363+
"[metrics.region] skip collect approximate size metrics, region({}) log_index_id({}) region_version({}) "
364+
"last_update_metrics_log_index({}) last_update_metrics_version({})",
365+
region->Id(), log_index_id, region_version, last_update_metrics_log_index, last_update_metrics_version);
366+
367+
continue;
368+
}
369+
370+
DINGO_LOG(INFO) << fmt::format(
371+
"[metrics.region] collect approximate size metrics, region({}) log_index_id({}) version({}) "
372+
"last_update_metrics_log_index({}) last_update_metrics_version({})",
373+
region->Id(), log_index_id, region_version, last_update_metrics_log_index, last_update_metrics_version);
374+
329375
if (region->GetRawEngineType() == pb::common::RAW_ENG_ROCKSDB) {
330376
need_collect_rocks_regions.push_back(region);
331377
} else if (region->GetRawEngineType() == pb::common::RAW_ENG_BDB) {
@@ -346,6 +392,11 @@ bool StoreRegionMetrics::CollectApproximateSizeMetrics() {
346392
auto region_metrics = GetMetrics(region_id);
347393
if (region_metrics != nullptr) {
348394
region_metrics->SetRegionSize(size);
395+
region_metrics->UpdateLastUpdateMetricsLogIndex();
396+
397+
DINGO_LOG(INFO) << fmt::format(
398+
"[metrics.region] get rocksdb region approximate size region({}) size({}) elapsed time[{} ms]", region_id,
399+
size, Helper::TimestampMs() - start_time);
349400
}
350401
}
351402
}
@@ -368,6 +419,11 @@ bool StoreRegionMetrics::CollectApproximateSizeMetrics() {
368419
auto region_metrics = GetMetrics(region_id);
369420
if (region_metrics != nullptr) {
370421
region_metrics->SetRegionSize(size);
422+
region_metrics->UpdateLastUpdateMetricsLogIndex();
423+
424+
DINGO_LOG(INFO) << fmt::format(
425+
"[metrics.region] get bdb region approximate size region({}) size({}) elapsed time[{} ms]", region_id,
426+
size, Helper::TimestampMs() - start_time);
371427
}
372428
}
373429
}

0 commit comments

Comments
 (0)