Skip to content

Commit 8c67746

Browse files
committed
Code review s2e1
1 parent 0a1990b commit 8c67746

File tree

2 files changed

+166
-56
lines changed

2 files changed

+166
-56
lines changed

src/VecSim/algorithms/svs/svs_tiered.h

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -212,9 +212,9 @@ class TieredSVSIndex : public VecSimTieredIndex<DataType, float> {
212212
// oldId is the index of the label in flat index before the swap.
213213
// newId is the index of the label in flat index after the swap.
214214
// if oldId == newId, it means that the vector was not moved in the Flat index, but was removed
215-
// at the end of the Flat index.
216-
// if label == SKIP_LABEL, it means that the vector was not moved/removed in the Flat index, but
217-
// updated in-place
215+
// from the end of the Flat index (no id swaps occurred internally).
216+
// if label == SKIP_LABEL, it means that the vector was not moved in the Flat index, but
217+
// updated in-place (hence was already removed and no need to remove it again)
218218
using swap_record = std::tuple<labelType, idType, idType>;
219219
constexpr static size_t SKIP_LABEL = std::numeric_limits<labelType>::max();
220220
std::vector<swap_record> swaps_journal;
@@ -741,6 +741,8 @@ class TieredSVSIndex : public VecSimTieredIndex<DataType, float> {
741741
const auto ft_ret = this->frontendIndex->addVector(blob, label);
742742

743743
if (ft_ret == 0) { // Vector was overriden - add 'skiping' swap to the journal.
744+
assert(!this->backendIndex->isMultiValue() &&
745+
"addVector() may return 0 for single value indices only");
744746
for (auto id : this->frontendIndex->getElementIds(label)) {
745747
this->swaps_journal.emplace_back(SKIP_LABEL, id, id);
746748
}
@@ -770,6 +772,14 @@ class TieredSVSIndex : public VecSimTieredIndex<DataType, float> {
770772

771773
// Delete vector from the frontend index.
772774
auto updated_ids = this->frontendIndex->deleteVectorAndGetUpdatedIds(label);
775+
776+
assert(std::all_of(updated_ids.begin(), updated_ids.end(),
777+
[&deleting_ids](const auto &pair) {
778+
return std::find(deleting_ids.begin(), deleting_ids.end(),
779+
pair.first) != deleting_ids.end();
780+
}) &&
781+
"updated_ids should be a subset of deleting_ids");
782+
773783
// Record swaps in the journal.
774784
for (auto id : deleting_ids) {
775785
auto it = updated_ids.find(id);
@@ -779,20 +789,12 @@ class TieredSVSIndex : public VecSimTieredIndex<DataType, float> {
779789
auto oldId = it->second.first;
780790
auto oldLabel = it->second.second;
781791
this->swaps_journal.emplace_back(oldLabel, oldId, newId);
782-
// Erase in assertion for debug compilation only
783-
// NOTE: There are extra braces to wrap comma C++ operator
784-
assert((updated_ids.erase(it), true));
785792
} else {
786793
// No swap, just delete is marked by oldId == newId == deleted id
787-
this->swaps_journal.emplace_back(label, id, id);
794+
this->swaps_journal.emplace_back(SKIP_LABEL, id, id);
788795
}
789796
}
790797

791-
// After processing all deleting_ids, updated_ids should be empty in debug
792-
// compilation.
793-
assert(updated_ids.empty() &&
794-
"updated_ids should be empty after processing all deleting_ids");
795-
796798
return deleting_ids.size();
797799
}
798800

tests/unit/test_svs_tiered.cpp

Lines changed: 152 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -2839,16 +2839,13 @@ TYPED_TEST(SVSTieredIndexTestBasic, switchDeleteModes) {
28392839
EXPECT_EQ(sz_f + sz_b, n);
28402840
}
28412841

2842-
TYPED_TEST(SVSTieredIndexTest, testSwapJournal) {
2842+
TYPED_TEST(SVSTieredIndexTestBasic, testSwapJournalSingle) {
28432843
// Create TieredSVS index instance with a mock queue.
28442844
const size_t dim = 4;
28452845
const size_t n = 15;
2846-
const bool is_multi = TypeParam::isMulti();
28472846

2848-
SVSParams params = {.type = TypeParam::get_index_type(),
2849-
.dim = dim,
2850-
.metric = VecSimMetric_L2,
2851-
.multi = is_multi};
2847+
SVSParams params = {
2848+
.type = TypeParam::get_index_type(), .dim = dim, .metric = VecSimMetric_L2, .multi = false};
28522849
VecSimParams svs_params = CreateParams(params);
28532850
auto mock_thread_pool = tieredIndexMock();
28542851

@@ -2898,14 +2895,10 @@ TYPED_TEST(SVSTieredIndexTest, testSwapJournal) {
28982895

28992896
// update job paused, we have vectors 0-(n-1) in the index, let's do index modifications
29002897

2901-
// For single-value index, we expect to override the vector.
2902-
// For multi-value index, we expect to add a new vector.
2903-
const int update_count = is_multi ? 1 : 0;
2904-
29052898
// Remove vector label=n-2, it is copied to backend index.
29062899
EXPECT_EQ(VecSimIndex_DeleteVector(tiered_index, n - 2), 2);
2907-
// Add/update vector label=1.
2908-
EXPECT_EQ(GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, 1, 10), update_count);
2900+
// Update vector label=1.
2901+
EXPECT_EQ(GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, 1, 10), 0);
29092902
// Add a new vector
29102903
EXPECT_EQ(GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, n, n), 1);
29112904
// Add another one
@@ -2914,10 +2907,10 @@ TYPED_TEST(SVSTieredIndexTest, testSwapJournal) {
29142907
EXPECT_EQ(VecSimIndex_DeleteVector(tiered_index, 0), 2);
29152908
// Remove the last vector copied to backend index
29162909
EXPECT_EQ(VecSimIndex_DeleteVector(tiered_index, n - 1), 2);
2917-
// Add/update vector label=2.
2918-
EXPECT_EQ(GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, 2, 20), update_count);
2919-
// Remove vector label=2: for multi: old is copied to backend , old + new are in flat
2920-
EXPECT_EQ(VecSimIndex_DeleteVector(tiered_index, 2), 1 + 2 * update_count);
2910+
// Update vector label=2.
2911+
EXPECT_EQ(GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, 2, 20), 0);
2912+
// Remove vector label=2.
2913+
EXPECT_EQ(VecSimIndex_DeleteVector(tiered_index, 2), 1);
29212914
// Remove vector label=n - in flat only
29222915
EXPECT_EQ(VecSimIndex_DeleteVector(tiered_index, n), 1);
29232916
// Add vector (n-1) again
@@ -2931,30 +2924,17 @@ TYPED_TEST(SVSTieredIndexTest, testSwapJournal) {
29312924

29322925
mock_thread_pool.thread_pool_join();
29332926

2934-
if (is_multi) {
2935-
// For multi-value index, following vectors should be in the index:
2936-
// 0: deleted, 1: (1,10), 2: deleted, 3:3, ..., n-2: deleted n-1: 10(n-1), n+1: n+1;
2937-
// total: n-2 labels, n-1 vectors
2938-
ASSERT_EQ(tiered_index->indexSize(), n - 1);
2939-
ASSERT_EQ(tiered_index->indexLabelCount(), n - 2);
2940-
2941-
// Backend index: 0:deleted, 1:1, 2:deleted, 3:3, ..., n-2:deleted, n-1:deleted; total: n-4
2942-
EXPECT_EQ(tiered_index->GetBackendIndex()->indexSize(), n - 4);
2943-
// Frontend index: 1:10, n-1:10(n-1), n+1:n+1
2944-
ASSERT_EQ(tiered_index->GetFlatIndex()->indexSize(), 3);
2945-
} else {
2946-
// For single-value index, following vectors should be in the index:
2947-
// 0:deleted, 1: 10, 2: deleted, 3:3, ..., n-2:deleted n-1: 10(n-1), n+1: n+1;
2948-
// total: n-2 vectors and labels
2949-
ASSERT_EQ(tiered_index->indexSize(), n - 2);
2950-
ASSERT_EQ(tiered_index->indexLabelCount(), n - 2);
2927+
// For single-value index, following vectors should be in the index:
2928+
// 0:deleted, 1: 10, 2: deleted, 3:3, ..., n-2:deleted n-1: 10(n-1), n+1: n+1;
2929+
// total: n-2 vectors and labels
2930+
ASSERT_EQ(tiered_index->indexSize(), n - 2);
2931+
ASSERT_EQ(tiered_index->indexLabelCount(), n - 2);
29512932

2952-
// Backend index: 0:deleted, 1:deleted, 2:deleted, 3:3, ..., n-2:deleted, n-1:deleted;
2953-
// total: n-5
2954-
EXPECT_EQ(tiered_index->GetBackendIndex()->indexSize(), n - 5);
2955-
// Frontend index: 1:10, n-1:10(n-1), n+1:n+1
2956-
ASSERT_EQ(tiered_index->GetFlatIndex()->indexSize(), 3);
2957-
}
2933+
// Backend index: 0:deleted, 1:deleted, 2:deleted, 3:3, ..., n-2:deleted, n-1:deleted;
2934+
// total: n-5
2935+
EXPECT_EQ(tiered_index->GetBackendIndex()->indexSize(), n - 5);
2936+
// Frontend index: 1:10, n-1:10(n-1), n+1:n+1
2937+
ASSERT_EQ(tiered_index->GetFlatIndex()->indexSize(), 3);
29582938

29592939
double abs_err = 1e-2; // Allow a larger relative error for quantization.
29602940
TEST_DATA_T expected_vector[dim];
@@ -2970,13 +2950,141 @@ TYPED_TEST(SVSTieredIndexTest, testSwapJournal) {
29702950
// Vector label=1, with value 10 should be in the flat index.
29712951
GenerateVector<TEST_DATA_T>(expected_vector, dim, 10);
29722952
ASSERT_NEAR(tiered_index->getDistanceFrom_Unsafe(1, expected_vector), 0, abs_err);
2973-
if (is_multi) {
2974-
// For multi-value index, there is vector label=1 kept in the backend.
2975-
// We expect the minimal distance for the query 1.0 to be taken from backend
2976-
GenerateVector<TEST_DATA_T>(expected_vector, dim, 1);
2977-
ASSERT_NEAR(tiered_index->getDistanceFrom_Unsafe(1, expected_vector), 0, abs_err);
2953+
2954+
// Vector label 2 - deleted
2955+
GenerateVector<TEST_DATA_T>(expected_vector, dim, 2);
2956+
ASSERT_TRUE(std::isnan(tiered_index->getDistanceFrom_Unsafe(2, expected_vector)));
2957+
2958+
// Vectors labels [3,n-3] - unchanged
2959+
for (size_t i = 3; i < n - 2; i++) {
2960+
GenerateVector<TEST_DATA_T>(expected_vector, dim, i);
2961+
ASSERT_NEAR(tiered_index->getDistanceFrom_Unsafe(i, expected_vector), 0, abs_err);
2962+
}
2963+
2964+
// Vector label n-1 - deleted and re-added
2965+
GenerateVector<TEST_DATA_T>(expected_vector, dim, (n - 1) * 10);
2966+
ASSERT_NEAR(tiered_index->getDistanceFrom_Unsafe(n - 1, expected_vector), 0, abs_err);
2967+
2968+
// Vector label n+1 - added
2969+
GenerateVector<TEST_DATA_T>(expected_vector, dim, (n + 1));
2970+
ASSERT_NEAR(tiered_index->getDistanceFrom_Unsafe(n + 1, expected_vector), 0, abs_err);
2971+
}
2972+
2973+
TYPED_TEST(SVSTieredIndexTestBasic, testSwapJournalMulti) {
2974+
// Create TieredSVS index instance with a mock queue.
2975+
const size_t dim = 4;
2976+
const size_t n = 15;
2977+
2978+
SVSParams params = {
2979+
.type = TypeParam::get_index_type(), .dim = dim, .metric = VecSimMetric_L2, .multi = true};
2980+
VecSimParams svs_params = CreateParams(params);
2981+
auto mock_thread_pool = tieredIndexMock();
2982+
2983+
// Forcibly set the training threshold to a value that will trigger the update job
2984+
// for first vector only.
2985+
auto *tiered_index = this->CreateTieredSVSIndex(svs_params, mock_thread_pool, 1, n * 100);
2986+
ASSERT_INDEX(tiered_index);
2987+
auto allocator = tiered_index->getAllocator();
2988+
2989+
// Add n vectors to the index.
2990+
for (size_t i = 0; i < n; i++) {
2991+
TEST_DATA_T vector[dim];
2992+
GenerateVector<TEST_DATA_T>(vector, dim, i);
2993+
tiered_index->addVector(vector, i);
29782994
}
29792995

2996+
// Pause the update job after svs index update
2997+
std::mutex mtx;
2998+
bool added_to_svs = false;
2999+
bool continue_job = false;
3000+
std::condition_variable cv;
3001+
3002+
auto tracing_callback = [&]() {
3003+
{
3004+
std::unique_lock lock(mtx);
3005+
added_to_svs = true; // Indicate that we are waiting for the update job to start.
3006+
}
3007+
cv.notify_one(); // Notify that the update job has started.
3008+
{
3009+
std::unique_lock lock(mtx);
3010+
cv.wait(lock, [&] { return continue_job; }); // Wait until we continue.
3011+
}
3012+
};
3013+
tiered_index->registerTracingCallback("UpdateJob::after_add_to_svs", tracing_callback);
3014+
3015+
// Launch the BG threads loop that takes jobs from the queue and executes them.
3016+
mock_thread_pool.init_threads();
3017+
3018+
{
3019+
// IMPORTANT!: Do not use ASSERT here, as it will not release the mutex and will cause a
3020+
// deadlock. Use EXPECT instead, so we can continue the test even if the condition is not
3021+
// met.
3022+
3023+
// Wait for the update job to start.
3024+
std::unique_lock lock(mtx);
3025+
EXPECT_TRUE(cv.wait_for(lock, std::chrono::seconds(100), [&] { return added_to_svs; }));
3026+
3027+
// update job paused, we have vectors 0-(n-1) in the index, let's do index modifications
3028+
3029+
// Remove vector label=n-2, it is copied to backend index.
3030+
EXPECT_EQ(VecSimIndex_DeleteVector(tiered_index, n - 2), 2);
3031+
// Add one more vector label=1.
3032+
EXPECT_EQ(GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, 1, 10), 1);
3033+
// Add a new vector
3034+
EXPECT_EQ(GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, n, n), 1);
3035+
// Add another one
3036+
EXPECT_EQ(GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, n + 1, n + 1), 1);
3037+
// Remove vector label=0, it is copied to backend index.
3038+
EXPECT_EQ(VecSimIndex_DeleteVector(tiered_index, 0), 2);
3039+
// Remove the last vector copied to backend index
3040+
EXPECT_EQ(VecSimIndex_DeleteVector(tiered_index, n - 1), 2);
3041+
// Add one more vector label=2.
3042+
EXPECT_EQ(GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, 2, 20), 1);
3043+
// Remove vector label=2: for multi: old is copied to backend , old + new are in flat
3044+
EXPECT_EQ(VecSimIndex_DeleteVector(tiered_index, 2), 3);
3045+
// Remove vector label=n - in flat only
3046+
EXPECT_EQ(VecSimIndex_DeleteVector(tiered_index, n), 1);
3047+
// Add vector (n-1) again
3048+
EXPECT_EQ(GenerateAndAddVector<TEST_DATA_T>(tiered_index, dim, n - 1, (n - 1) * 10), 1);
3049+
3050+
continue_job = true; // Indicate that we can continue the update job.
3051+
}
3052+
3053+
// continue the update job
3054+
cv.notify_all(); // Notify that we can continue.
3055+
3056+
mock_thread_pool.thread_pool_join();
3057+
3058+
// For multi-value index, following vectors should be in the index:
3059+
// 0: deleted, 1: (1,10), 2: deleted, 3:3, ..., n-2: deleted n-1: 10(n-1), n+1: n+1;
3060+
// total: n-2 labels, n-1 vectors
3061+
ASSERT_EQ(tiered_index->indexSize(), n - 1);
3062+
ASSERT_EQ(tiered_index->indexLabelCount(), n - 2);
3063+
3064+
// Backend index: 0:deleted, 1:1, 2:deleted, 3:3, ..., n-2:deleted, n-1:deleted; total: n-4
3065+
EXPECT_EQ(tiered_index->GetBackendIndex()->indexSize(), n - 4);
3066+
// Frontend index: 1:10, n-1:10(n-1), n+1:n+1
3067+
ASSERT_EQ(tiered_index->GetFlatIndex()->indexSize(), 3);
3068+
3069+
double abs_err = 1e-2; // Allow a larger relative error for quantization.
3070+
TEST_DATA_T expected_vector[dim];
3071+
3072+
// Vector label 0 - deleted
3073+
GenerateVector<TEST_DATA_T>(expected_vector, dim, 0);
3074+
ASSERT_TRUE(std::isnan(tiered_index->getDistanceFrom_Unsafe(0, expected_vector)));
3075+
3076+
// Vector label n-2 - deleted
3077+
GenerateVector<TEST_DATA_T>(expected_vector, dim, 0);
3078+
ASSERT_TRUE(std::isnan(tiered_index->getDistanceFrom_Unsafe(n - 2, expected_vector)));
3079+
3080+
// There are 2 vectors labeled "1" with values 1 in backend and 10 in flat.
3081+
// We expect the minimal distance for the query 10 to be taken from flat index.
3082+
GenerateVector<TEST_DATA_T>(expected_vector, dim, 10);
3083+
ASSERT_NEAR(tiered_index->getDistanceFrom_Unsafe(1, expected_vector), 0, abs_err);
3084+
// And the minimal distance for the query 1.0 to be taken from backend
3085+
GenerateVector<TEST_DATA_T>(expected_vector, dim, 1);
3086+
ASSERT_NEAR(tiered_index->getDistanceFrom_Unsafe(1, expected_vector), 0, abs_err);
3087+
29803088
// Vector label 2 - deleted
29813089
GenerateVector<TEST_DATA_T>(expected_vector, dim, 2);
29823090
ASSERT_TRUE(std::isnan(tiered_index->getDistanceFrom_Unsafe(2, expected_vector)));

0 commit comments

Comments
 (0)