From 9903d7f3accba8655b047b431c278741b663f05d Mon Sep 17 00:00:00 2001 From: Shaleen Garg Date: Mon, 30 Mar 2026 15:06:29 +0530 Subject: [PATCH 01/28] filter --- src/filter/category_index.hpp | 15 +-------------- src/filter/filter.hpp | 3 ++- 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/src/filter/category_index.hpp b/src/filter/category_index.hpp index 58ffa62c69..7a75133601 100644 --- a/src/filter/category_index.hpp +++ b/src/filter/category_index.hpp @@ -175,26 +175,13 @@ namespace ndd { return bitmap.contains(id); } - void add_batch(const std::string& field, - const std::string& value, - const std::vector& ids) { - if(ids.empty()) { - return; - } - std::string filter_key = format_filter_key(field, value); - ndd::RoaringBitmap bitmap = get_bitmap_internal(filter_key); - for(const auto& id : ids) { - bitmap.add(id); - } - store_bitmap_internal(filter_key, bitmap); - } - // Helper for batch operations where key is already formatted void add_batch_by_key(const std::string& key, const std::vector& ids) { if(ids.empty()) { return; } ndd::RoaringBitmap bitmap = get_bitmap_internal(key); + //TODO: use addMany instead of add for(const auto& id : ids) { bitmap.add(id); } diff --git a/src/filter/filter.hpp b/src/filter/filter.hpp index 392fe2b404..9d6a18fec7 100644 --- a/src/filter/filter.hpp +++ b/src/filter/filter.hpp @@ -417,7 +417,8 @@ class Filter { } if(type == FieldType::Unknown) { - LOG_DEBUG("Unsupported filter type for field '" << field << "'"); + /*This should ideally be an error or atleast an info log.*/ + LOG_INFO("Unsupported filter type for field '" << field << "'"); continue; } From 159e205db9c6ac21108927f7dfe3f3d2c6672934 Mon Sep 17 00:00:00 2001 From: Shaleen Garg Date: Tue, 7 Apr 2026 11:23:04 +0530 Subject: [PATCH 02/28] removing dead code --- src/core/ndd.hpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/core/ndd.hpp b/src/core/ndd.hpp index 55f6e5bc57..2700a688e5 100644 --- a/src/core/ndd.hpp +++ 
b/src/core/ndd.hpp @@ -1472,16 +1472,6 @@ class IndexManager { } } - std::optional> searchKNN(const std::string& index_id, - const std::vector& query, - size_t k, - const nlohmann::json& filter_array, - ndd::FilterParams params = {}, - bool include_vectors = false, - size_t ef = 0) { - return searchKNN(index_id, query, {}, {}, k, filter_array, params, include_vectors, ef); - } - std::optional> searchKNN(const std::string& index_id, const std::vector& query, From 693badb81ef5ccc3dfaa97cbfee364eb5920127a Mon Sep 17 00:00:00 2001 From: Shaleen Garg Date: Wed, 15 Apr 2026 05:59:43 +0000 Subject: [PATCH 03/28] unified implementation of add_filters_from_json --- src/filter/filter.hpp | 40 +--------------------------------------- 1 file changed, 1 insertion(+), 39 deletions(-) diff --git a/src/filter/filter.hpp b/src/filter/filter.hpp index 9d6a18fec7..79a6e91dd1 100644 --- a/src/filter/filter.hpp +++ b/src/filter/filter.hpp @@ -472,45 +472,7 @@ class Filter { } void add_filters_from_json(ndd::idInt numeric_id, const std::string& filter_json) { - try { - auto j = nlohmann::json::parse(filter_json); - for(const auto& [field, value] : j.items()) { - FieldType type = FieldType::Unknown; - if(value.is_boolean()) { - type = FieldType::Bool; - } else if(value.is_number()) { - type = FieldType::Number; - } else if(value.is_string()) { - type = FieldType::String; - } - - if(type == FieldType::Unknown) { - LOG_DEBUG("Unsupported filter type for field '" << field << "'"); - continue; - } - - if(!register_field_type(field, type)) { - LOG_ERROR(1205, index_id_, "Type mismatch for field '" << field << "'"); - continue; - } - - if(value.is_string()) { - add_to_filter(field, value.get(), numeric_id); - } else if(value.is_number()) { - uint32_t sortable_val; - if(value.is_number_integer()) { - sortable_val = ndd::filter::int_to_sortable(value.get()); - } else { - sortable_val = ndd::filter::float_to_sortable(value.get()); - } - numeric_index_->put(field, numeric_id, sortable_val); - } 
else if(value.is_boolean()) { - add_to_filter(field, value.get() ? "1" : "0", numeric_id); - } - } - } catch(const std::exception& e) { - LOG_ERROR(1206, index_id_, "Error adding filters: " << e.what()); - } + add_filters_from_json_batch({{numeric_id, filter_json}}); } void remove_filters_from_json(ndd::idInt numeric_id, const std::string& filter_json) { From 7dd581f4b4cd3b3a7425796eb337d7219d044820 Mon Sep 17 00:00:00 2001 From: Shaleen Garg Date: Thu, 16 Apr 2026 03:41:41 +0000 Subject: [PATCH 04/28] grouping numeric insertions for transactionality and performance --- src/filter/filter.hpp | 21 +++++++++++++++++---- src/filter/numeric_index.hpp | 29 +++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/src/filter/filter.hpp b/src/filter/filter.hpp index 79a6e91dd1..1e68932087 100644 --- a/src/filter/filter.hpp +++ b/src/filter/filter.hpp @@ -401,6 +401,9 @@ class Filter { // Create a map to collect IDs for each filter std::unordered_map> filter_to_ids; + filter_to_ids.reserve(id_filter_pairs.size()); + std::vector pending_numeric; + pending_numeric.reserve(id_filter_pairs.size()); // Group IDs by filter for(const auto& [numeric_id, filter_json] : id_filter_pairs) { @@ -429,20 +432,19 @@ class Filter { if(value.is_string()) { std::string filter_key = format_filter_key(field, value.get()); - filter_to_ids[filter_key].push_back(numeric_id); + filter_to_ids[filter_key].emplace_back(numeric_id); } else if(value.is_number()) { - // Use Numeric Index for numbers uint32_t sortable_val; if(value.is_number_integer()) { sortable_val = ndd::filter::int_to_sortable(value.get()); } else { sortable_val = ndd::filter::float_to_sortable(value.get()); } - numeric_index_->put(field, numeric_id, sortable_val); + pending_numeric.emplace_back(field, numeric_id, sortable_val); } else if(value.is_boolean()) { std::string filter_key = format_filter_key(field, value.get() ? 
"1" : "0"); - filter_to_ids[filter_key].push_back(numeric_id); + filter_to_ids[filter_key].emplace_back(numeric_id); } else { LOG_WARN(1203, index_id_, @@ -456,6 +458,17 @@ class Filter { } } + /** + * XXX: For transactional correctness of filter adds, all the filters + * should be added in a single transaction. + * For now, they are being added in two different transactions. + * one for numeric_index and other for labels. + */ + + if(!pending_numeric.empty()) { + numeric_index_->put_batch(pending_numeric); + } + // Process each filter with its batch of IDs for(const auto& [filter_key, ids] : filter_to_ids) { add_to_filter_batch(filter_key, ids); diff --git a/src/filter/numeric_index.hpp b/src/filter/numeric_index.hpp index c002652137..4bf1da8aa8 100644 --- a/src/filter/numeric_index.hpp +++ b/src/filter/numeric_index.hpp @@ -14,6 +14,17 @@ namespace ndd { namespace filter { + struct NumericBatchEntry { + std::string field; + ndd::idInt id; + uint32_t value; + + NumericBatchEntry(std::string field_in, ndd::idInt id_in, uint32_t value_in) : + field(std::move(field_in)), + id(id_in), + value(value_in) {} + }; + // --- Sortable Key Utilities --- inline uint32_t float_to_sortable(float f) { uint32_t i; @@ -262,6 +273,24 @@ namespace ndd { } } + void put_batch(const std::vector& entries) { + if(entries.empty()) { + return; + } + + MDBX_txn* txn; + mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); + try { + for(const auto& entry : entries) { + put_internal(txn, entry.field, entry.id, entry.value); + } + mdbx_txn_commit(txn); + } catch(...) 
{ + mdbx_txn_abort(txn); + throw; + } + } + void remove(const std::string& field, ndd::idInt id) { MDBX_txn* txn; mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); From 89e9df061caf70de4c49f755cfbcbedcd797af51 Mon Sep 17 00:00:00 2001 From: Shaleen Garg Date: Thu, 16 Apr 2026 04:46:28 +0000 Subject: [PATCH 05/28] addMany instead of a looped add --- src/filter/category_index.hpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/filter/category_index.hpp b/src/filter/category_index.hpp index 7a75133601..f898fa67c8 100644 --- a/src/filter/category_index.hpp +++ b/src/filter/category_index.hpp @@ -181,10 +181,7 @@ namespace ndd { return; } ndd::RoaringBitmap bitmap = get_bitmap_internal(key); - //TODO: use addMany instead of add - for(const auto& id : ids) { - bitmap.add(id); - } + bitmap.addMany(ids.size(), ids.data()); store_bitmap_internal(key, bitmap); } From eda67ec7cedd5be2e8ff34646f24162360fd2575 Mon Sep 17 00:00:00 2001 From: Shaleen Garg Date: Thu, 16 Apr 2026 06:27:47 +0000 Subject: [PATCH 06/28] cleanup --- docs/filter.md | 2 +- src/filter/numeric_index.hpp | 12 ------------ 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/docs/filter.md b/docs/filter.md index 7340840409..3f4b6ce5e4 100644 --- a/docs/filter.md +++ b/docs/filter.md @@ -25,7 +25,7 @@ The system prioritizes **Pre-Filtering** followed by an adaptive search executio *Optimized for range queries, high compression, and sequential access.* ### 2.1. Storage Architecture (Hybrid Bucket) -The database (LMDB) acts as a coarse-grained B+ Tree. +The database (LMDB) acts as a coarse-grained B+ Tree. NumericIndex opens two MDBX named databases: "numeric_forward" and "numeric_inverted" * **Key:** `[FieldID] + [Base_Value_32bit]`. * Floats are mapped to lexicographically ordered integers to preserve sort order. * Keys are stored in Big-Endian to support native cursor iteration. 
diff --git a/src/filter/numeric_index.hpp b/src/filter/numeric_index.hpp index 4bf1da8aa8..d56b85ecab 100644 --- a/src/filter/numeric_index.hpp +++ b/src/filter/numeric_index.hpp @@ -261,18 +261,6 @@ namespace ndd { } } - void put(const std::string& field, ndd::idInt id, uint32_t value) { - MDBX_txn* txn; - mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); - try { - put_internal(txn, field, id, value); - mdbx_txn_commit(txn); - } catch(...) { - mdbx_txn_abort(txn); - throw; - } - } - void put_batch(const std::vector& entries) { if(entries.empty()) { return; From 0ff2f589432298c8d1a5b87a724bdfeaf9ff2cb5 Mon Sep 17 00:00:00 2001 From: Shaleen Garg Date: Thu, 16 Apr 2026 06:30:12 +0000 Subject: [PATCH 07/28] put batch todo comments --- src/filter/numeric_index.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/filter/numeric_index.hpp b/src/filter/numeric_index.hpp index d56b85ecab..0ce2c57e4b 100644 --- a/src/filter/numeric_index.hpp +++ b/src/filter/numeric_index.hpp @@ -261,6 +261,12 @@ namespace ndd { } } + /** + * TODO: + * 1. comprehensive error print and return. + * If there is an error here, there should be a way to reverse + * vector add operation. + */ void put_batch(const std::vector& entries) { if(entries.empty()) { return; From fa9aa992b9a18a19a1a3485ff5a568d8ef8bb99f Mon Sep 17 00:00:00 2001 From: Shaleen Garg Date: Thu, 16 Apr 2026 11:47:54 +0000 Subject: [PATCH 08/28] commenting for better understanding --- src/filter/numeric_index.hpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/filter/numeric_index.hpp b/src/filter/numeric_index.hpp index 0ce2c57e4b..960545c0ec 100644 --- a/src/filter/numeric_index.hpp +++ b/src/filter/numeric_index.hpp @@ -53,6 +53,14 @@ namespace ndd { // --- Bucket Structure (Hybrid) --- struct Bucket { + /** + * XXX: Ideally this bucket should be page size + * bounded. 
Currently it is difficult to do that + * here because the size of summary_bitmap depends + * on the kind of userspace filter upserts and not + * the number of them. + */ + static constexpr size_t MAX_SIZE = 1024; static constexpr uint32_t MAX_DELTA = 65535; @@ -350,6 +358,11 @@ namespace ndd { rc = mdbx_cursor_get(cursor, &key, &data, MDBX_PREV); } } else if (rc == MDBX_NOTFOUND) { + /** + * The only possible bucket that could still contain + * value is the very last bucket in the database. + * Hence jumping there. + */ rc = mdbx_cursor_get(cursor, &key, &data, MDBX_LAST); } From 704725598acf773f5f9c0f24c5e77dbe8322cee1 Mon Sep 17 00:00:00 2001 From: Shaleen Garg Date: Fri, 24 Apr 2026 11:46:21 +0000 Subject: [PATCH 09/28] name changes --- src/filter/filter.hpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/filter/filter.hpp b/src/filter/filter.hpp index 1e68932087..5d581e47b4 100644 --- a/src/filter/filter.hpp +++ b/src/filter/filter.hpp @@ -399,11 +399,11 @@ class Filter { return; } - // Create a map to collect IDs for each filter - std::unordered_map> filter_to_ids; - filter_to_ids.reserve(id_filter_pairs.size()); - std::vector pending_numeric; - pending_numeric.reserve(id_filter_pairs.size()); + // Create a map to collect IDs for each label filter + std::unordered_map> label_filter_to_ids; + label_filter_to_ids.reserve(id_filter_pairs.size()); + std::vector numeric_filter_entries; + numeric_filter_entries.reserve(id_filter_pairs.size()); // Group IDs by filter for(const auto& [numeric_id, filter_json] : id_filter_pairs) { @@ -432,7 +432,7 @@ class Filter { if(value.is_string()) { std::string filter_key = format_filter_key(field, value.get()); - filter_to_ids[filter_key].emplace_back(numeric_id); + label_filter_to_ids[filter_key].emplace_back(numeric_id); } else if(value.is_number()) { uint32_t sortable_val; if(value.is_number_integer()) { @@ -440,11 +440,11 @@ class Filter { } else { sortable_val = 
ndd::filter::float_to_sortable(value.get()); } - pending_numeric.emplace_back(field, numeric_id, sortable_val); + numeric_filter_entries.emplace_back(field, numeric_id, sortable_val); } else if(value.is_boolean()) { std::string filter_key = format_filter_key(field, value.get() ? "1" : "0"); - filter_to_ids[filter_key].emplace_back(numeric_id); + label_filter_to_ids[filter_key].emplace_back(numeric_id); } else { LOG_WARN(1203, index_id_, @@ -465,12 +465,12 @@ class Filter { * one for numeric_index and other for labels. */ - if(!pending_numeric.empty()) { - numeric_index_->put_batch(pending_numeric); + if(!numeric_filter_entries.empty()) { + numeric_index_->put_batch(numeric_filter_entries); } // Process each filter with its batch of IDs - for(const auto& [filter_key, ids] : filter_to_ids) { + for(const auto& [filter_key, ids] : label_filter_to_ids) { add_to_filter_batch(filter_key, ids); } } From 6643ccf881c3b36c577169f643dff5530a5baed5 Mon Sep 17 00:00:00 2001 From: Shaleen Garg Date: Fri, 24 Apr 2026 11:48:30 +0000 Subject: [PATCH 10/28] docs updated for understanding --- docs/filter.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/filter.md b/docs/filter.md index 3f4b6ce5e4..fa1fa45bac 100644 --- a/docs/filter.md +++ b/docs/filter.md @@ -25,7 +25,9 @@ The system prioritizes **Pre-Filtering** followed by an adaptive search executio *Optimized for range queries, high compression, and sequential access.* ### 2.1. Storage Architecture (Hybrid Bucket) -The database (LMDB) acts as a coarse-grained B+ Tree. NumericIndex opens two MDBX named databases: "numeric_forward" and "numeric_inverted" +The database (LMDB) acts as a coarse-grained B+ Tree. NumericIndex opens two MDBX named databases: "numeric_forward" and "numeric_inverted". + +In numeric_inverted * **Key:** `[FieldID] + [Base_Value_32bit]`. * Floats are mapped to lexicographically ordered integers to preserve sort order. 
* Keys are stored in Big-Endian to support native cursor iteration. @@ -35,6 +37,13 @@ The database (LMDB) acts as a coarse-grained B+ Tree. NumericIndex opens two MDB * **Values:** Compressed as `uint16_t` deltas relative to the Key's `Base_Value`. * **IDs:** Raw `idInt` array, index-aligned with values. +In numeric_forward +* **Key:** `[field string]:[4-byte big-endian integer from values]` + * Floats are mapped to lexicographically ordered integers to preserve sort order. + * Keys are stored in Big-Endian to support native cursor iteration +* **Value + + ### 2.2. Query Execution * **Buckets Fully Inside Selection (Middle):** Use **Summary Bitmap**. Zero array access. * **Buckets Partially Overlapping (Edges):** Scan `Values` array (SIMD), use indices to fetch specific `IDs`. From 0a6697bac22cb1a59ad364081b9a69830a0984c0 Mon Sep 17 00:00:00 2001 From: Shaleen Garg Date: Thu, 30 Apr 2026 05:48:02 +0000 Subject: [PATCH 11/28] timing function to time individual components of filtered search --- src/core/ndd.hpp | 47 ++++++++---- src/main.cpp | 1 + src/utils/search_timing.hpp | 140 ++++++++++++++++++++++++++++++++++++ 3 files changed, 176 insertions(+), 12 deletions(-) create mode 100644 src/utils/search_timing.hpp diff --git a/src/core/ndd.hpp b/src/core/ndd.hpp index 2700a688e5..17447d64e4 100644 --- a/src/core/ndd.hpp +++ b/src/core/ndd.hpp @@ -13,6 +13,7 @@ #include "msgpack_ndd.hpp" #include "quant_vector.hpp" #include "wal.hpp" +#include "../utils/search_timing.hpp" #include "../quant/dispatch.hpp" #include #include @@ -1485,6 +1486,7 @@ class IndexManager { float kDenseRrfWeight = settings::DEFAULT_DENSE_RRF_WEIGHT, float kRrfRankConstant = settings::DEFAULT_RRF_RANK_CONSTANT) { + ndd::ScopedSearchTiming search_total_timer(ndd::searchTimingStats().search_total); const float kSparseRrfWeight = 1.0f - kDenseRrfWeight; try { auto entry_ptr = getIndexEntry(index_id); @@ -1512,6 +1514,8 @@ class IndexManager { // 0. 
Compute Filter Bitmap (Shared) std::optional active_filter_bitmap; if (!filter_array.empty()) { + ndd::ScopedSearchTiming filter_bitmap_timer( + ndd::searchTimingStats().filter_bitmap_compute); active_filter_bitmap = entry.vector_storage->filter_store_->computeFilterBitmap(filter_array); } const ndd::RoaringBitmap* filter_ptr = @@ -1569,29 +1573,48 @@ class IndexManager { if (card == 0) { // No results match filter } else if (card < params.prefilter_threshold) { - // Strategy A: Brute Force on Small Subset + ndd::ScopedSearchTiming prefilter_total_timer( + ndd::searchTimingStats().prefilter_total); + ndd::recordPrefilterCardinality(card); + + // Strategy A: Brute Force on Small Subset std::vector valid_ids; - valid_ids.reserve(card); - bitmap.iterate([](ndd::idInt id, void* ptr){ - static_cast*>(ptr)->push_back(id); - return true; - }, &valid_ids); + { + ndd::ScopedSearchTiming bitmap_to_ids_timer( + ndd::searchTimingStats().prefilter_bitmap_to_ids); + valid_ids.reserve(card); + bitmap.iterate( + [](ndd::idInt id, void* ptr) { + static_cast*>(ptr)->push_back(id); + return true; + }, + &valid_ids); + } - // Fetch vectors - auto vector_batch = entry.vector_storage->get_vectors_batch(valid_ids); + std::vector>> vector_batch; + { + ndd::ScopedSearchTiming mdbx_get_timer( + ndd::searchTimingStats().prefilter_mdbx_get); + vector_batch = entry.vector_storage->get_vectors_batch(valid_ids); + } - // Prepare subset for bruteforce search std::vector>> vector_subset; vector_subset.reserve(vector_batch.size()); for(auto& [nid, vbytes] : vector_batch) { vector_subset.emplace_back(nid, std::move(vbytes)); } - dense_results = hnswlib::searchKnnSubset( - query_bytes.data(), vector_subset, k, space); + { + ndd::ScopedSearchTiming distance_compute_timer( + ndd::searchTimingStats().prefilter_distance_compute); + dense_results = hnswlib::searchKnnSubset( + query_bytes.data(), vector_subset, k, space); + } } else { // Strategy B: Filtered HNSW Search + ndd::ScopedSearchTiming 
filtered_hnsw_timer( + ndd::searchTimingStats().filtered_hnsw_search); BitMapFilterFunctor functor(bitmap); size_t effective_ef = ef > 0 ? ef : settings::DEFAULT_EF_SEARCH; @@ -2270,4 +2293,4 @@ inline std::pair IndexManager::uploadBackup(const std::string backup_store_.writeBackupJson(username, backup_db); return {true, "Backup uploaded successfully"}; -} \ No newline at end of file +} diff --git a/src/main.cpp b/src/main.cpp index 4654a54c20..28918be54f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -261,6 +261,7 @@ int main(int argc, char** argv) { {{"status", "ok"}, {"timestamp", (std::int64_t)std::chrono::system_clock::now().time_since_epoch().count()}}); PRINT_LOG_TIME(); + ndd::printSearchTimingStats(); ndd::printSparseSearchDebugStats(); ndd::printSparseUpdateDebugStats(); print_mdbx_stats(); diff --git a/src/utils/search_timing.hpp b/src/utils/search_timing.hpp new file mode 100644 index 0000000000..787bcf5491 --- /dev/null +++ b/src/utils/search_timing.hpp @@ -0,0 +1,140 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace ndd { + inline constexpr bool SEARCH_TIMING_ENABLED = false; + + struct SearchTimingCounter { + std::atomic calls{0}; + std::atomic total_ns{0}; + }; + + struct SearchTimingStats { + SearchTimingCounter search_total; + SearchTimingCounter filter_bitmap_compute; + SearchTimingCounter filtered_hnsw_search; + SearchTimingCounter prefilter_total; + SearchTimingCounter prefilter_bitmap_to_ids; + SearchTimingCounter prefilter_mdbx_get; + SearchTimingCounter prefilter_distance_compute; + std::atomic prefilter_cardinality_total{0}; + std::atomic prefilter_cardinality_max{0}; + }; + + inline SearchTimingStats& searchTimingStats() { + static SearchTimingStats stats; + return stats; + } + + inline timespec searchTimingNow() { + timespec ts{}; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts; + } + + inline uint64_t searchTimingElapsedNs(const timespec& start, const timespec& end) { + const uint64_t 
start_ns = + static_cast(start.tv_sec) * 1'000'000'000ULL + + static_cast(start.tv_nsec); + const uint64_t end_ns = + static_cast(end.tv_sec) * 1'000'000'000ULL + + static_cast(end.tv_nsec); + return end_ns >= start_ns ? end_ns - start_ns : 0; + } + + inline void addSearchTiming(SearchTimingCounter& counter, uint64_t elapsed_ns) { + if constexpr(SEARCH_TIMING_ENABLED) { + counter.calls.fetch_add(1, std::memory_order_relaxed); + counter.total_ns.fetch_add(elapsed_ns, std::memory_order_relaxed); + } + } + + class ScopedSearchTiming { + public: + explicit ScopedSearchTiming(SearchTimingCounter& counter) : + counter_(SEARCH_TIMING_ENABLED ? &counter : nullptr) { + if constexpr(SEARCH_TIMING_ENABLED) { + start_ = searchTimingNow(); + } + } + + ~ScopedSearchTiming() { + if constexpr(SEARCH_TIMING_ENABLED) { + addSearchTiming(*counter_, + searchTimingElapsedNs(start_, searchTimingNow())); + } + } + + private: + SearchTimingCounter* counter_{nullptr}; + timespec start_{}; + }; + + inline void recordPrefilterCardinality(size_t cardinality) { + if constexpr(!SEARCH_TIMING_ENABLED) { + return; + } + SearchTimingStats& stats = searchTimingStats(); + stats.prefilter_cardinality_total.fetch_add(static_cast(cardinality), + std::memory_order_relaxed); + + uint64_t current_max = + stats.prefilter_cardinality_max.load(std::memory_order_relaxed); + const uint64_t card = static_cast(cardinality); + while(card > current_max + && !stats.prefilter_cardinality_max.compare_exchange_weak( + current_max, card, std::memory_order_relaxed)) { + } + } + + inline void printSearchTimingStats() { + if constexpr(!SEARCH_TIMING_ENABLED) { + return; + } + SearchTimingStats& stats = searchTimingStats(); + + auto print_counter = [](const char* name, SearchTimingCounter& counter) -> uint64_t { + const uint64_t calls = counter.calls.exchange(0, std::memory_order_relaxed); + const uint64_t total_ns = counter.total_ns.exchange(0, std::memory_order_relaxed); + const double total_ms = static_cast(total_ns) / 
1'000'000.0; + const double avg_ms = calls ? total_ms / static_cast(calls) : 0.0; + std::cerr << name << " count: " << calls << '\n'; + std::cerr << name << " total(ms): " + << std::fixed << std::setprecision(3) << total_ms << '\n'; + std::cerr << name << " avg(ms): " + << std::fixed << std::setprecision(3) << avg_ms << '\n'; + return calls; + }; + + std::cerr << "Search timing stats since last healthcheck\n"; + print_counter("search_total", stats.search_total); + print_counter("filter_bitmap_compute", stats.filter_bitmap_compute); + print_counter("filtered_hnsw_search", stats.filtered_hnsw_search); + const uint64_t prefilter_calls = print_counter("prefilter_total", stats.prefilter_total); + print_counter("prefilter_bitmap_to_ids", stats.prefilter_bitmap_to_ids); + print_counter("prefilter_mdbx_get", stats.prefilter_mdbx_get); + print_counter("prefilter_distance_compute", stats.prefilter_distance_compute); + + const uint64_t cardinality_total = + stats.prefilter_cardinality_total.exchange(0, std::memory_order_relaxed); + const uint64_t cardinality_max = + stats.prefilter_cardinality_max.exchange(0, std::memory_order_relaxed); + std::cerr << "prefilter_cardinality total: " << cardinality_total << '\n'; + std::cerr << "prefilter_cardinality max: " << cardinality_max << '\n'; + std::cerr << "prefilter_cardinality avg: " + << std::fixed << std::setprecision(3) + << (prefilter_calls + ? 
static_cast(cardinality_total) + / static_cast(prefilter_calls) + : 0.0) + << '\n'; + std::cerr << "=================================\n"; + } + +} // namespace ndd From d51372d652ee4a96d5c58fa95519bf8274e52ce0 Mon Sep 17 00:00:00 2001 From: Shaleen Garg Date: Thu, 30 Apr 2026 07:53:16 +0000 Subject: [PATCH 12/28] no need to copy data from mdbx --- src/core/ndd.hpp | 45 ++++++++++++++++++++++------------ src/storage/vector_storage.hpp | 44 +++++++++++++++++++++++++++++++++ src/utils/search_timing.hpp | 2 ++ 3 files changed, 75 insertions(+), 16 deletions(-) diff --git a/src/core/ndd.hpp b/src/core/ndd.hpp index 17447d64e4..f31510622b 100644 --- a/src/core/ndd.hpp +++ b/src/core/ndd.hpp @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -1591,24 +1592,36 @@ class IndexManager { &valid_ids); } - std::vector>> vector_batch; { - ndd::ScopedSearchTiming mdbx_get_timer( - ndd::searchTimingStats().prefilter_mdbx_get); - vector_batch = entry.vector_storage->get_vectors_batch(valid_ids); - } - - std::vector>> vector_subset; - vector_subset.reserve(vector_batch.size()); - for(auto& [nid, vbytes] : vector_batch) { - vector_subset.emplace_back(nid, std::move(vbytes)); - } + ndd::ScopedSearchTiming direct_score_timer( + ndd::searchTimingStats().prefilter_direct_mdbx_score); + auto distance_func = space->get_dist_func(); + void* dist_func_param = space->get_dist_func_param(); + std::priority_queue> top_results; + + if(k > 0) { + entry.vector_storage->visit_vectors_by_ids( + valid_ids, + [&](ndd::idInt numeric_id, const void* vector_data) { + float distance = distance_func(query_bytes.data(), + vector_data, + dist_func_param); + + if(top_results.size() < k) { + top_results.emplace(distance, numeric_id); + } else if(distance < top_results.top().first) { + top_results.pop(); + top_results.emplace(distance, numeric_id); + } + }); + } - { - ndd::ScopedSearchTiming distance_compute_timer( - ndd::searchTimingStats().prefilter_distance_compute); - 
dense_results = hnswlib::searchKnnSubset( - query_bytes.data(), vector_subset, k, space); + dense_results.reserve(top_results.size()); + while(!top_results.empty()) { + dense_results.push_back(top_results.top()); + top_results.pop(); + } + std::reverse(dense_results.begin(), dense_results.end()); } } else { diff --git a/src/storage/vector_storage.hpp b/src/storage/vector_storage.hpp index 8ca7f56ab9..e5430be930 100644 --- a/src/storage/vector_storage.hpp +++ b/src/storage/vector_storage.hpp @@ -13,6 +13,7 @@ #include #include #include +#include // Handles vector storage class VectorStore { @@ -339,6 +340,40 @@ class VectorStore { } } + template + size_t visit_vectors_by_ids(const std::vector& numeric_ids, + Visitor&& visitor) const { + if(numeric_ids.empty()) { + return 0; + } + + MDBX_txn* txn; + int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn); + if(rc != MDBX_SUCCESS) { + throw std::runtime_error(std::string("Failed to begin transaction: ") + mdbx_strerror(rc)); + } + + size_t visited = 0; + try { + for(const auto& numeric_id : numeric_ids) { + MDBX_val key{const_cast(&numeric_id), sizeof(ndd::idInt)}; + MDBX_val data; + + rc = mdbx_get(txn, dbi_, &key, &data); + if(rc == MDBX_SUCCESS && data.iov_len == bytes_per_vector_) { + visitor(numeric_id, static_cast(data.iov_base)); + visited++; + } + } + + mdbx_txn_abort(txn); + return visited; + } catch(...) 
{ + mdbx_txn_abort(txn); + throw; + } + } + void remove(ndd::idInt numeric_id) { MDBX_txn* txn; int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); @@ -745,6 +780,15 @@ class VectorStorage { get_vectors_batch(const std::vector& numeric_ids) const { return vector_store_->get_vectors_batch(numeric_ids); } + + template + size_t visit_vectors_by_ids(const std::vector& numeric_ids, + Visitor&& visitor) const { + return vector_store_->visit_vectors_by_ids( + numeric_ids, + std::forward(visitor)); + } + ndd::VectorMeta get_meta(ndd::idInt numeric_id) const { return meta_store_->get_meta(numeric_id); } diff --git a/src/utils/search_timing.hpp b/src/utils/search_timing.hpp index 787bcf5491..cad40fe5f9 100644 --- a/src/utils/search_timing.hpp +++ b/src/utils/search_timing.hpp @@ -21,6 +21,7 @@ namespace ndd { SearchTimingCounter filtered_hnsw_search; SearchTimingCounter prefilter_total; SearchTimingCounter prefilter_bitmap_to_ids; + SearchTimingCounter prefilter_direct_mdbx_score; SearchTimingCounter prefilter_mdbx_get; SearchTimingCounter prefilter_distance_compute; std::atomic prefilter_cardinality_total{0}; @@ -118,6 +119,7 @@ namespace ndd { print_counter("filtered_hnsw_search", stats.filtered_hnsw_search); const uint64_t prefilter_calls = print_counter("prefilter_total", stats.prefilter_total); print_counter("prefilter_bitmap_to_ids", stats.prefilter_bitmap_to_ids); + print_counter("prefilter_direct_mdbx_score", stats.prefilter_direct_mdbx_score); print_counter("prefilter_mdbx_get", stats.prefilter_mdbx_get); print_counter("prefilter_distance_compute", stats.prefilter_distance_compute); From 29a60a4f21162dcb07a842be7c021a90ada2005a Mon Sep 17 00:00:00 2001 From: Shaleen Garg Date: Mon, 4 May 2026 13:15:08 +0000 Subject: [PATCH 13/28] using return type OperationResult to propagate the logs --- src/core/ndd.hpp | 194 +++++-- src/filter/category_index.hpp | 275 +++++++--- src/filter/filter.hpp | 927 +++++++++++++++++++++------------ 
src/filter/numeric_index.hpp | 924 +++++++++++++++++++------------- src/main.cpp | 139 ++++- src/storage/vector_storage.hpp | 106 +++- src/utils/types.hpp | 8 +- tests/filter_test.cpp | 120 +++-- 8 files changed, 1800 insertions(+), 893 deletions(-) diff --git a/src/core/ndd.hpp b/src/core/ndd.hpp index f31510622b..20a1db2068 100644 --- a/src/core/ndd.hpp +++ b/src/core/ndd.hpp @@ -990,8 +990,18 @@ class IndexManager { entry->alg = std::move(new_alg); } + /* + * Adds or updates a batch of vectors and their associated filter documents. + * + * Return codes: + * 0 = success; value is true when vectors were inserted and false for an empty batch + * 1-99 = propagated filter validation failure from vector storage + * 100-199 = storage, sparse, or propagated filter storage failure; caller should return HTTP 500 + * 200-299 = propagated filter corruption/invariant failure; caller should return HTTP 500 + */ template - bool addVectors(const std::string& index_id, const std::vector& vectors) { + ndd::OperationResult addVectors(const std::string& index_id, + const std::vector& vectors) { try { // Get the index entry (loads if needed, handles all locking) auto entry_ptr = getIndexEntry(index_id); @@ -1004,7 +1014,7 @@ class IndexManager { LOG_DEBUG("Adding " << vectors.size() << " vectors to index " << index_id); if(vectors.empty()) { LOG_DEBUG("No vectors to add"); - return false; + return {SUCCESS, "No vectors to add", false}; } // CRITICAL FIX: Pass WAL to create_ids_batch for atomic logging @@ -1070,7 +1080,7 @@ class IndexManager { index_id, "Failed to update sparse storage for batch size " << sparse_batch.size()); - return false; + return {100, "Failed to update sparse storage"}; } } } @@ -1102,7 +1112,15 @@ class IndexManager { // Copy QuantVectorObject for storage (we need to keep original for HNSW) storage_vectors.emplace_back(numeric_ids[i].first, quantized_vectors[i]); } - entry.vector_storage->store_vectors_batch(storage_vectors); + auto storage_result = 
entry.vector_storage->store_vectors_batch(storage_vectors); + if(!storage_result.ok()) { + if(storage_result.code < 100) { + LOG_WARN(1212, index_id, "Insert filters rejected: " << storage_result.message); + } else { + LOG_ERROR(1219, index_id, "Insert filters failed: " << storage_result.message); + } + return {storage_result.code, storage_result.message}; + } LOG_DEBUG("Stored " << storage_vectors.size() << " pre-quantized vectors in vector storage"); @@ -1162,7 +1180,7 @@ class IndexManager { } PRINT_LOG_TIME(); - return true; + return {SUCCESS, "", true}; } catch(const std::runtime_error& e) { // Re-throw runtime_error (includes backup-in-progress check) // so it can be caught by API layer and returned as proper JSON error @@ -1332,9 +1350,17 @@ class IndexManager { } } - // Delete vectors from id mapper, delete filter and mark as deleted in HNSW. Does not delete - // meta, vector data Meta and vector data will be overwritten when the id is reused - bool deleteVectorsByIds(CacheEntry& entry, const std::vector& numeric_ids) { + /* + * Deletes vectors from id mapper, filter indexes, sparse storage, and HNSW live set. 
+ * + * Return codes: + * 0 = success + * 1-99 = propagated filter validation failure from vector storage + * 100-199 = storage or propagated filter storage failure; caller should return HTTP 500 + * 200-299 = propagated filter corruption/invariant failure; caller should return HTTP 500 + */ + ndd::OperationResult + deleteVectorsByIds(CacheEntry& entry, const std::vector& numeric_ids) { try { for(ndd::idInt numeric_id : numeric_ids) { auto meta = entry.vector_storage->get_meta(numeric_id); @@ -1346,9 +1372,23 @@ class IndexManager { continue; } // Remove the filter - entry.vector_storage->deleteFilter(numeric_id, meta.filter); - // Mark as deleted in HNSW index + auto filter_result = entry.vector_storage->deleteFilter(numeric_id, meta.filter); + if(!filter_result.ok()) { + if(filter_result.code < 100) { + LOG_WARN(1216, + entry.index_id, + "Delete-vector filter removal rejected: " + << filter_result.message); + } else { + LOG_ERROR(1217, + entry.index_id, + "Delete-vector filter removal failed: " + << filter_result.message); + } + return {filter_result.code, filter_result.message}; + } + // Mark as deleted in HNSW index entry.alg->markDelete(numeric_id); // Delete from sparse storage if hybrid index @@ -1362,14 +1402,24 @@ class IndexManager { // Mark the index as dirty entry.markDirty(); - return true; + return {SUCCESS, "", true}; } catch(const std::exception& e) { LOG_ERROR(2035, entry.index_id, "Failed to delete vectors: " << e.what()); - return false; + return {100, std::string("Failed to delete vectors: ") + e.what()}; } } - size_t deleteVectorsByFilter(const std::string& index_id, const nlohmann::json& filter_array) { + /* + * Deletes all vectors matching a filter query. 
+ * + * Return codes: + * 0 = success; value is the number of deleted vectors + * 1-99 = propagated filter validation failure; caller should return HTTP 400 + * 100-199 = storage or propagated filter storage failure; caller should return HTTP 500 + * 200-299 = propagated filter corruption/invariant failure; caller should return HTTP 500 + */ + ndd::OperationResult + deleteVectorsByFilter(const std::string& index_id, const nlohmann::json& filter_array) { try { auto entry_ptr = getIndexEntry(index_id); auto& entry = *entry_ptr; @@ -1377,11 +1427,27 @@ class IndexManager { // Use per-index operation mutex to prevent concurrent operations std::unique_lock operation_lock(entry.operation_mutex); - auto numeric_ids = + auto numeric_ids_result = entry.vector_storage->filter_store_->getIdsMatchingFilter(filter_array); + if(!numeric_ids_result.ok()) { + if(numeric_ids_result.code < 100) { + LOG_WARN(1213, + index_id, + "Delete-by-filter rejected: " << numeric_ids_result.message); + } else { + LOG_ERROR(1214, + index_id, + "Delete-by-filter failed while computing filter: " + << numeric_ids_result.message); + } + return {numeric_ids_result.code, numeric_ids_result.message}; + } + + auto& numeric_ids = numeric_ids_result.value_or_throw(); LOG_DEBUG("Filter matched " << numeric_ids.size() << " vectors"); - if(deleteVectorsByIds(entry, numeric_ids)) { + auto delete_result = deleteVectorsByIds(entry, numeric_ids); + if(delete_result.ok() && delete_result.value_or_throw()) { // Check if we need to save based on WAL entry count after logging WriteAheadLog* wal = getOrCreateWAL(entry); if(wal->getEntryCount() >= persistence_config_.save_every_n_updates) { @@ -1389,22 +1455,34 @@ class IndexManager { << " updates"); saveIndexInternal(entry); } - return numeric_ids.size(); + return {SUCCESS, "", numeric_ids.size()}; } else { - return 0; + if(!delete_result.ok()) { + return {delete_result.code, delete_result.message}; + } + return {SUCCESS, "", static_cast(0)}; } } catch(const 
std::runtime_error& e) { // Re-throw runtime_error (includes backup-in-progress check) throw; } catch(const std::exception& e) { LOG_ERROR(2036, index_id, "Failed to delete vectors by filter: " << e.what()); - return 0; + return {100, std::string("Failed to delete vectors by filter: ") + e.what()}; } } - // Update filters for a batch of vectors - size_t updateFilters(const std::string& index_id, - const std::vector>& updates) { + /* + * Replaces filter documents for a batch of vectors. + * + * Return codes: + * 0 = success; value is the number of updated filters + * 1-99 = propagated filter validation failure; caller should return HTTP 400 + * 100-199 = storage or propagated filter storage failure; caller should return HTTP 500 + * 200-299 = propagated filter corruption/invariant failure; caller should return HTTP 500 + */ + ndd::OperationResult + updateFilters(const std::string& index_id, + const std::vector>& updates) { try { auto entry_ptr = getIndexEntry(index_id); auto& entry = *entry_ptr; @@ -1419,7 +1497,19 @@ class IndexManager { continue; } - entry.vector_storage->updateFilter(numeric_id, new_filter); + auto filter_result = entry.vector_storage->updateFilter(numeric_id, new_filter); + if(!filter_result.ok()) { + if(filter_result.code < 100) { + LOG_WARN(1215, + index_id, + "Update-filters rejected: " << filter_result.message); + } else { + LOG_ERROR(1218, + index_id, + "Update-filters failed: " << filter_result.message); + } + return {filter_result.code, filter_result.message}; + } updated_count++; } @@ -1427,20 +1517,27 @@ class IndexManager { entry.markDirty(); } - return updated_count; + return {SUCCESS, "", updated_count}; } catch(const std::runtime_error& e) { // Re-throw runtime_error (includes backup-in-progress check) throw; } catch(const std::exception& e) { LOG_ERROR(2037, index_id, "Failed to update filters: " << e.what()); - return 0; + return {100, std::string("Failed to update filters: ") + e.what()}; } } - // Delete a single vector by string 
ID - vector data will not be deleted. The meta and filter - // will be deleted and the vector will be marked as deleted in HNSW. The id will be put in the - // deleted_ids in id mapper and will be reused for new vectors - bool deleteVector(const std::string& index_id, const std::string& str_id) { + /* + * Deletes one vector by string id and removes its filter index entries. + * + * Return codes: + * 0 = success; value is false when the vector id does not exist + * 1-99 = propagated filter validation failure; caller should return HTTP 400 + * 100-199 = storage or propagated filter storage failure; caller should return HTTP 500 + * 200-299 = propagated filter corruption/invariant failure; caller should return HTTP 500 + */ + ndd::OperationResult deleteVector(const std::string& index_id, + const std::string& str_id) { try { auto entry_ptr = getIndexEntry(index_id); auto& entry = *entry_ptr; @@ -1450,12 +1547,12 @@ class IndexManager { size_t numeric_id = entry.id_mapper->get_id(str_id); if(numeric_id == 0) { - return false; + return {SUCCESS, "", false}; } - bool result = deleteVectorsByIds(entry, {static_cast(numeric_id)}); + auto result = deleteVectorsByIds(entry, {static_cast(numeric_id)}); // Check if we need to save based on WAL entry count after logging - if(result) { + if(result.ok() && result.value_or_throw()) { WriteAheadLog* wal = getOrCreateWAL(entry); if(wal->getEntryCount() >= persistence_config_.save_every_n_updates) { LOG_DEBUG("Saving index " << index_id << " after " << wal->getEntryCount() @@ -1470,11 +1567,20 @@ class IndexManager { throw; } catch(const std::exception& e) { LOG_ERROR(2038, index_id, "Failed to delete vector: " << e.what()); - return false; + return {100, std::string("Failed to delete vector: ") + e.what()}; } } - std::optional> + /* + * Searches an index with optional filter bitmap computation. 
+ * + * Return codes: + * 0 = success + * 1-99 = propagated filter validation failure; caller should return HTTP 400 + * 100-199 = search or propagated filter storage failure; caller should return HTTP 500 + * 200-299 = propagated filter corruption/invariant failure; caller should return HTTP 500 + */ + ndd::OperationResult> searchKNN(const std::string& index_id, const std::vector& query, const std::vector& sparse_indices, @@ -1509,7 +1615,7 @@ class IndexManager { // Zero-weight sources cannot influence the final ranking, so skip their retrieval // work entirely. if(!run_dense_search && !run_sparse_search) { - return std::vector(); + return {SUCCESS, "", std::vector()}; } // 0. Compute Filter Bitmap (Shared) @@ -1517,7 +1623,19 @@ class IndexManager { if (!filter_array.empty()) { ndd::ScopedSearchTiming filter_bitmap_timer( ndd::searchTimingStats().filter_bitmap_compute); - active_filter_bitmap = entry.vector_storage->filter_store_->computeFilterBitmap(filter_array); + auto filter_result = + entry.vector_storage->filter_store_->computeFilterBitmap(filter_array); + if(!filter_result.ok()) { + if(filter_result.code < 100) { + LOG_WARN(1220, index_id, "Search filter rejected: " << filter_result.message); + } else { + LOG_ERROR(1221, + index_id, + "Search filter computation failed: " << filter_result.message); + } + return {filter_result.code, filter_result.message}; + } + active_filter_bitmap = std::move(filter_result.value_or_throw()); } const ndd::RoaringBitmap* filter_ptr = active_filter_bitmap ? 
&(*active_filter_bitmap) : nullptr; @@ -1660,7 +1778,7 @@ class IndexManager { std::vector> final_candidates; if(dense_results.empty() && sparse_results.empty()) { - return std::vector(); + return {SUCCESS, "", std::vector()}; } else if(sparse_results.empty()) { // Only dense results final_candidates.reserve(dense_results.size()); @@ -1756,10 +1874,10 @@ class IndexManager { if(results.size() > k) { results.resize(k); } - return results; + return {SUCCESS, "", std::move(results)}; } catch(const std::exception& e) { LOG_ERROR(2039, index_id, "Search failed: " << e.what()); - return std::nullopt; + return {100, std::string("Search failed: ") + e.what()}; } } diff --git a/src/filter/category_index.hpp b/src/filter/category_index.hpp index f898fa67c8..79d183882f 100644 --- a/src/filter/category_index.hpp +++ b/src/filter/category_index.hpp @@ -1,12 +1,13 @@ #pragma once +#include #include +#include #include -#include -#include + #include "mdbx/mdbx.h" -#include "../utils/log.hpp" #include "../core/types.hpp" +#include "../utils/types.hpp" namespace ndd { namespace filter { @@ -21,56 +22,65 @@ namespace ndd { return field + ":" + value; } - // Load bitmap from LMDB - ndd::RoaringBitmap get_bitmap_internal(const std::string& filter_key) const { - MDBX_txn* txn; + /* + * Loads the bitmap stored for a formatted category filter key. 
+ * + * Return codes: + * 0 = success + * 100 = MDBX transaction or read failure; caller should log ERROR and return HTTP 500 + * 200 = corrupt stored bitmap payload; caller should log ERROR and return HTTP 500 + */ + ndd::OperationResult + get_bitmap_internal(const std::string& filter_key) const { + MDBX_txn* txn = nullptr; int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn); if(rc != MDBX_SUCCESS) { - throw std::runtime_error("Failed to begin read transaction: " - + std::string(mdbx_strerror(rc))); + return {100, + "Failed to begin category bitmap read transaction: " + + std::string(mdbx_strerror(rc))}; } - try { - MDBX_val key{const_cast(filter_key.c_str()), filter_key.size()}; - MDBX_val data; - - rc = mdbx_get(txn, dbi_, &key, &data); - if(rc == MDBX_NOTFOUND) { - mdbx_txn_abort(txn); - // LOG_DEBUG("Filter key not found: " << filter_key); - return ndd::RoaringBitmap(); // Return empty bitmap - } - if(rc != MDBX_SUCCESS) { - mdbx_txn_abort(txn); - throw std::runtime_error("Failed to read filter key '" + filter_key - + "': " + std::string(mdbx_strerror(rc))); - } + MDBX_val key{const_cast(filter_key.c_str()), filter_key.size()}; + MDBX_val data; - if(data.iov_len == 0) { - mdbx_txn_abort(txn); - // LOG_DEBUG("Empty data for filter key: " << filter_key); - return ndd::RoaringBitmap(); - } + rc = mdbx_get(txn, dbi_, &key, &data); + if(rc == MDBX_NOTFOUND || (rc == MDBX_SUCCESS && data.iov_len == 0)) { + mdbx_txn_abort(txn); + return {SUCCESS, "", ndd::RoaringBitmap()}; + } + if(rc != MDBX_SUCCESS) { + mdbx_txn_abort(txn); + return {100, + "Failed to read category bitmap key '" + filter_key + + "': " + std::string(mdbx_strerror(rc))}; + } + try { ndd::RoaringBitmap bitmap = ndd::RoaringBitmap::read(static_cast(data.iov_base)); mdbx_txn_abort(txn); - return bitmap; - } catch(...) 
{ + return {SUCCESS, "", std::move(bitmap)}; + } catch(const std::exception& e) { mdbx_txn_abort(txn); - throw; + return {200, "Corrupt category bitmap payload for key '" + filter_key + + "': " + e.what()}; } } - void store_bitmap_internal(const std::string& filter_key, - const ndd::RoaringBitmap& bitmap) { - if(bitmap.cardinality() == 0) { - // LOG_DEBUG("Storing empty bitmap for key: " << filter_key); - } - + /* + * Stores the bitmap for a formatted category filter key. + * + * Return codes: + * 0 = success + * 100 = MDBX transaction or write failure; caller should log ERROR and return HTTP 500 + * 200 = invalid bitmap serialization; caller should log ERROR and return HTTP 500 + */ + ndd::OperationResult<> store_bitmap_internal(const std::string& filter_key, + const ndd::RoaringBitmap& bitmap) { size_t required_size = bitmap.getSizeInBytes(); if(required_size == 0) { - throw std::runtime_error("Invalid bitmap serialization: size is 0"); + return {200, "Invalid category bitmap serialization size for key '" + + filter_key + "'"}; } std::vector buffer(required_size); @@ -79,113 +89,212 @@ namespace ndd { MDBX_val key{const_cast(filter_key.c_str()), filter_key.size()}; MDBX_val data{const_cast(buffer.data()), buffer.size()}; - MDBX_txn* txn; + MDBX_txn* txn = nullptr; int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); if(rc != MDBX_SUCCESS) { - throw std::runtime_error("Failed to begin write transaction: " - + std::string(mdbx_strerror(rc))); + return {100, + "Failed to begin category bitmap write transaction: " + + std::string(mdbx_strerror(rc))}; } rc = mdbx_put(txn, dbi_, &key, &data, MDBX_UPSERT); if(rc != MDBX_SUCCESS) { mdbx_txn_abort(txn); - throw std::runtime_error("Failed to store bitmap: " - + std::string(mdbx_strerror(rc))); + return {100, "Failed to store category bitmap key '" + filter_key + + "': " + std::string(mdbx_strerror(rc))}; } rc = mdbx_txn_commit(txn); if(rc != MDBX_SUCCESS) { - throw std::runtime_error("Failed to commit 
transaction: " - + std::string(mdbx_strerror(rc))); + return {100, + "Failed to commit category bitmap write transaction: " + + std::string(mdbx_strerror(rc))}; } + return {SUCCESS, ""}; } public: CategoryIndex(MDBX_env* env) : env_(env) { - MDBX_txn* txn; + MDBX_txn* txn = nullptr; int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); if(rc != MDBX_SUCCESS) { - throw std::runtime_error(std::string("Failed to begin txn for CategoryIndex init: ") + mdbx_strerror(rc)); + throw std::runtime_error( + std::string("Failed to begin txn for CategoryIndex init: ") + + mdbx_strerror(rc)); } - // Open named DB for category/boolean rc = mdbx_dbi_open(txn, "category_idx", MDBX_CREATE, &dbi_); if(rc != MDBX_SUCCESS) { mdbx_txn_abort(txn); - throw std::runtime_error(std::string("Failed to open category_idx dbi: ") + mdbx_strerror(rc)); + throw std::runtime_error(std::string("Failed to open category_idx dbi: ") + + mdbx_strerror(rc)); } - mdbx_txn_commit(txn); + rc = mdbx_txn_commit(txn); + if(rc != MDBX_SUCCESS) { + throw std::runtime_error(std::string("Failed to commit CategoryIndex init: ") + + mdbx_strerror(rc)); + } } - // Faceting: List all unique values for a field - std::vector scan_values(const std::string& field) const { + /* + * Lists all unique category values stored for one field. 
+ * + * Return codes: + * 0 = success + * 100 = MDBX transaction, cursor, or scan failure; caller should log ERROR and return HTTP 500 + */ + ndd::OperationResult> + scan_values(const std::string& field) const { std::vector values; - MDBX_txn* txn; - if (mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn) != MDBX_SUCCESS) return values; + MDBX_txn* txn = nullptr; + int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn); + if(rc != MDBX_SUCCESS) { + return {100, + "Failed to begin category value scan transaction: " + + std::string(mdbx_strerror(rc))}; + } - MDBX_cursor* cursor; - mdbx_cursor_open(txn, dbi_, &cursor); + MDBX_cursor* cursor = nullptr; + rc = mdbx_cursor_open(txn, dbi_, &cursor); + if(rc != MDBX_SUCCESS) { + mdbx_txn_abort(txn); + return {100, + "Failed to open category value scan cursor: " + + std::string(mdbx_strerror(rc))}; + } std::string prefix = field + ":"; MDBX_val key{const_cast(prefix.c_str()), prefix.size()}; MDBX_val data; - int rc = mdbx_cursor_get(cursor, &key, &data, MDBX_SET_RANGE); + rc = mdbx_cursor_get(cursor, &key, &data, MDBX_SET_RANGE); while(rc == MDBX_SUCCESS) { - std::string found_key((char*)key.iov_base, key.iov_len); - if(found_key.rfind(prefix, 0) != 0) break; + std::string found_key(static_cast(key.iov_base), key.iov_len); + if(found_key.rfind(prefix, 0) != 0) { + break; + } values.push_back(found_key.substr(prefix.size())); rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT); } + mdbx_cursor_close(cursor); mdbx_txn_abort(txn); - return values; + + if(rc != MDBX_SUCCESS && rc != MDBX_NOTFOUND) { + return {100, "Failed during category value scan: " + + std::string(mdbx_strerror(rc))}; + } + return {SUCCESS, "", std::move(values)}; } - ndd::RoaringBitmap get_bitmap(const std::string& field, - const std::string& value) const { + /* + * Loads the bitmap for one category field/value pair. 
+ * + * Return codes: + * 0 = success + * 100-199 = propagated MDBX/storage failure from the bitmap read helper + * 200-299 = propagated corruption/invariant failure from the bitmap read helper + */ + ndd::OperationResult + get_bitmap(const std::string& field, const std::string& value) const { return get_bitmap_internal(format_filter_key(field, value)); } - // Direct key access for internal use if needed, or expose format_filter_key - ndd::RoaringBitmap get_bitmap_by_key(const std::string& key) const { + /* + * Loads the bitmap for an already formatted category key. + * + * Return codes: + * 0 = success + * 100-199 = propagated MDBX/storage failure from the bitmap read helper + * 200-299 = propagated corruption/invariant failure from the bitmap read helper + */ + ndd::OperationResult + get_bitmap_by_key(const std::string& key) const { return get_bitmap_internal(key); } - void add(const std::string& field, const std::string& value, ndd::idInt id) { + /* + * Adds one id to a category field/value bitmap. + * + * Return codes: + * 0 = success + * 100-199 = propagated MDBX/storage failure from bitmap read/write helpers + * 200-299 = propagated corruption/invariant failure from bitmap read/write helpers + */ + ndd::OperationResult<> + add(const std::string& field, const std::string& value, ndd::idInt id) { std::string filter_key = format_filter_key(field, value); - ndd::RoaringBitmap bitmap = get_bitmap_internal(filter_key); - bitmap.add(id); - store_bitmap_internal(filter_key, bitmap); + auto bitmap_result = get_bitmap_internal(filter_key); + if(!bitmap_result.ok()) { + return {bitmap_result.code, bitmap_result.message}; + } + + bitmap_result.value_or_throw().add(id); + return store_bitmap_internal(filter_key, bitmap_result.value_or_throw()); } - void remove(const std::string& field, const std::string& value, ndd::idInt id) { + /* + * Removes one id from a category field/value bitmap. 
+ * + * Return codes: + * 0 = success + * 100-199 = propagated MDBX/storage failure from bitmap read/write helpers + * 200-299 = propagated corruption/invariant failure from bitmap read/write helpers + */ + ndd::OperationResult<> + remove(const std::string& field, const std::string& value, ndd::idInt id) { std::string filter_key = format_filter_key(field, value); - ndd::RoaringBitmap bitmap = get_bitmap_internal(filter_key); - bitmap.remove(id); - store_bitmap_internal(filter_key, bitmap); + auto bitmap_result = get_bitmap_internal(filter_key); + if(!bitmap_result.ok()) { + return {bitmap_result.code, bitmap_result.message}; + } + + bitmap_result.value_or_throw().remove(id); + return store_bitmap_internal(filter_key, bitmap_result.value_or_throw()); } - bool contains(const std::string& field, const std::string& value, ndd::idInt id) const { - std::string filter_key = format_filter_key(field, value); - ndd::RoaringBitmap bitmap = get_bitmap_internal(filter_key); - return bitmap.contains(id); + /* + * Checks whether one id is present in a category field/value bitmap. + * + * Return codes: + * 0 = success + * 100-199 = propagated MDBX/storage failure from the bitmap read helper + * 200-299 = propagated corruption/invariant failure from the bitmap read helper + */ + ndd::OperationResult + contains(const std::string& field, const std::string& value, ndd::idInt id) const { + auto bitmap_result = get_bitmap_internal(format_filter_key(field, value)); + if(!bitmap_result.ok()) { + return {bitmap_result.code, bitmap_result.message}; + } + return {SUCCESS, "", bitmap_result.value_or_throw().contains(id)}; } - // Helper for batch operations where key is already formatted - void add_batch_by_key(const std::string& key, const std::vector& ids) { + /* + * Adds a batch of ids to an already formatted category key. 
+ * + * Return codes: + * 0 = success + * 100-199 = propagated MDBX/storage failure from bitmap read/write helpers + * 200-299 = propagated corruption/invariant failure from bitmap read/write helpers + */ + ndd::OperationResult<> + add_batch_by_key(const std::string& key, const std::vector& ids) { if(ids.empty()) { - return; + return {SUCCESS, ""}; } - ndd::RoaringBitmap bitmap = get_bitmap_internal(key); - bitmap.addMany(ids.size(), ids.data()); - store_bitmap_internal(key, bitmap); + auto bitmap_result = get_bitmap_internal(key); + if(!bitmap_result.ok()) { + return {bitmap_result.code, bitmap_result.message}; + } + + bitmap_result.value_or_throw().addMany(ids.size(), ids.data()); + return store_bitmap_internal(key, bitmap_result.value_or_throw()); } - // Expose key formatting for external batching logic static std::string make_key(const std::string& field, const std::string& value) { return format_filter_key(field, value); } diff --git a/src/filter/filter.hpp b/src/filter/filter.hpp index 5d581e47b4..3870ff1dd5 100644 --- a/src/filter/filter.hpp +++ b/src/filter/filter.hpp @@ -1,47 +1,48 @@ #pragma once -// System includes -#include +#include +#include +#include #include +#include #include -#include -#include -#include -#include +#include #include +#include #include #include "json/nlohmann_json.hpp" -#include "../utils/settings.hpp" #include "mdbx/mdbx.h" -#include "../utils/log.hpp" #include "../core/types.hpp" -#include "../hnsw/hnswlib.h" // For BaseFilterFunctor +#include "../hnsw/hnswlib.h" +#include "../utils/log.hpp" +#include "../utils/settings.hpp" +#include "../utils/types.hpp" -#include "numeric_index.hpp" #include "category_index.hpp" +#include "numeric_index.hpp" enum class FieldType : uint8_t { Unknown = 0, String = 1, - Number = 2, // Unified Integer and Float + Number = 2, Bool = 4 }; -// Filter Functor for HNSW class BitMapFilterFunctor : public hnswlib::BaseFilterFunctor { const ndd::RoaringBitmap& bitmap_; + public: - BitMapFilterFunctor(const 
ndd::RoaringBitmap& bitmap) : bitmap_(bitmap) {} - bool operator()(ndd::idInt id) override { - return bitmap_.contains(id); - } + BitMapFilterFunctor(const ndd::RoaringBitmap& bitmap) : + bitmap_(bitmap) {} + + bool operator()(ndd::idInt id) override { return bitmap_.contains(id); } }; class Filter { private: MDBX_env* env_; - MDBX_dbi dbi_; // Used for schema storage + MDBX_dbi dbi_; std::string index_id_; std::string path_; std::unique_ptr numeric_index_; @@ -51,129 +52,228 @@ class Filter { std::unordered_map schema_cache_; mutable std::mutex schema_mutex_; - void load_schema() { - MDBX_txn* txn; + /* + * Loads the persisted filter schema into the in-memory schema cache. + * + * Return codes: + * 0 = success + * 100 = MDBX transaction or read failure; caller should log ERROR and return HTTP 500 + * 200 = corrupt schema JSON payload; caller should log ERROR and return HTTP 500 + */ + ndd::OperationResult<> load_schema() { + MDBX_txn* txn = nullptr; int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn); if(rc != MDBX_SUCCESS) { - LOG_ERROR( - 1210, index_id_, "Failed to begin schema read transaction: " << mdbx_strerror(rc)); - return; + return {100, "Failed to begin schema read transaction: " + + std::string(mdbx_strerror(rc))}; } - MDBX_val key{const_cast(SCHEMA_KEY), strlen(SCHEMA_KEY)}; + MDBX_val key{const_cast(SCHEMA_KEY), std::strlen(SCHEMA_KEY)}; MDBX_val data; rc = mdbx_get(txn, dbi_, &key, &data); - if(rc == MDBX_SUCCESS && data.iov_len > 0) { - try { - std::string json_str(static_cast(data.iov_base), data.iov_len); - auto j = nlohmann::json::parse(json_str); - std::lock_guard lock(schema_mutex_); - for(auto& [k, v] : j.items()) { - schema_cache_[k] = static_cast(v.get()); - } - } catch(...) 
{ - LOG_ERROR(1201, index_id_, "Failed to load filter schema"); + if(rc == MDBX_NOTFOUND || (rc == MDBX_SUCCESS && data.iov_len == 0)) { + mdbx_txn_abort(txn); + return {SUCCESS, ""}; + } + if(rc != MDBX_SUCCESS) { + mdbx_txn_abort(txn); + return {100, "Failed to read filter schema: " + std::string(mdbx_strerror(rc))}; + } + + try { + std::string json_str(static_cast(data.iov_base), data.iov_len); + auto parsed = nlohmann::json::parse(json_str); + std::lock_guard lock(schema_mutex_); + schema_cache_.clear(); + for(auto& [field, stored_type] : parsed.items()) { + schema_cache_[field] = static_cast(stored_type.get()); } + } catch(const std::exception& e) { + mdbx_txn_abort(txn); + return {200, "Failed to parse filter schema: " + std::string(e.what())}; } + mdbx_txn_abort(txn); + return {SUCCESS, ""}; } - void save_schema_internal() { - nlohmann::json j; - for(const auto& [k, v] : schema_cache_) { - j[k] = static_cast(v); + /* + * Persists the current in-memory filter schema cache. + * + * Return codes: + * 0 = success + * 100 = MDBX transaction, write, or commit failure; caller should log ERROR and return HTTP 500 + */ + ndd::OperationResult<> save_schema_internal() { + nlohmann::json schema_json; + for(const auto& [field, type] : schema_cache_) { + schema_json[field] = static_cast(type); } - std::string json_str = j.dump(); + std::string json_str = schema_json.dump(); - MDBX_txn* txn; + MDBX_txn* txn = nullptr; int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); if(rc != MDBX_SUCCESS) { - LOG_ERROR( - 1208, index_id_, "Failed to begin schema write transaction: " << mdbx_strerror(rc)); - return; + return {100, "Failed to begin schema write transaction: " + + std::string(mdbx_strerror(rc))}; } - MDBX_val key{const_cast(SCHEMA_KEY), strlen(SCHEMA_KEY)}; + MDBX_val key{const_cast(SCHEMA_KEY), std::strlen(SCHEMA_KEY)}; MDBX_val data{const_cast(json_str.c_str()), json_str.size()}; rc = mdbx_put(txn, dbi_, &key, &data, MDBX_UPSERT); - if(rc == MDBX_SUCCESS) { 
- rc = mdbx_txn_commit(txn); - if(rc != MDBX_SUCCESS) { - LOG_ERROR( - 1209, index_id_, "Failed to commit filter schema update: " << mdbx_strerror(rc)); - } - } else { + if(rc != MDBX_SUCCESS) { mdbx_txn_abort(txn); - LOG_ERROR(1211, index_id_, "Failed to persist filter schema: " << mdbx_strerror(rc)); + return {100, "Failed to persist filter schema: " + + std::string(mdbx_strerror(rc))}; + } + + rc = mdbx_txn_commit(txn); + if(rc != MDBX_SUCCESS) { + return {100, "Failed to commit filter schema update: " + + std::string(mdbx_strerror(rc))}; } + return {SUCCESS, ""}; } - bool register_field_type(const std::string& field, FieldType type) { + /* + * Registers a field type in the filter schema if it is not already present. + * + * Return codes: + * 0 = success + * 3 = field type mismatch with existing schema; caller should return HTTP 400 + * 100-199 = propagated MDBX/storage failure from schema persistence + */ + ndd::OperationResult<> register_field_type(const std::string& field, FieldType type) { std::lock_guard lock(schema_mutex_); auto it = schema_cache_.find(field); if(it != schema_cache_.end()) { - return it->second == type; + if(it->second == type) { + return {SUCCESS, ""}; + } + return {3, "Filter field '" + field + "' has a different existing type"}; } schema_cache_[field] = type; - save_schema_internal(); - return true; + auto save_result = save_schema_internal(); + if(!save_result.ok()) { + schema_cache_.erase(field); + return save_result; + } + return {SUCCESS, ""}; + } + + /* + * Converts a JSON number into the current sortable numeric filter encoding. 
+ * + * Return codes: + * 0 = success + * 2 = value is not numeric; caller should return HTTP 400 + */ + static ndd::OperationResult sortable_from_json(const nlohmann::json& value, + const std::string& context) { + if(value.is_number_integer()) { + return {SUCCESS, "", ndd::filter::int_to_sortable(value.get())}; + } + if(value.is_number()) { + return {SUCCESS, "", ndd::filter::float_to_sortable(value.get())}; + } + return {2, context + " must be a number"}; + } + + /* + * Converts a JSON scalar into the category key value representation. + * + * Return codes: + * 0 = success + * 2 = value is not a supported category scalar or is too long; caller should return HTTP 400 + */ + static ndd::OperationResult category_value_from_json(const nlohmann::json& value, + const std::string& context) { + std::string str_val; + if(value.is_string()) { + str_val = value.get(); + } else if(value.is_boolean()) { + str_val = value.get() ? "1" : "0"; + } else if(value.is_number_integer()) { + str_val = std::to_string(value.get()); + } else { + return {2, context + " must be string, integer, or boolean"}; + } + + if(str_val.size() > 255) { + return {2, context + " is too long"}; + } + return {SUCCESS, "", std::move(str_val)}; + } + + static std::string format_filter_key(const std::string& field, const std::string& value) { + return field + ":" + value; } void init_environment() { int rc = mdbx_env_create(&env_); - if(rc != 0) { - throw std::runtime_error(std::string("Failed to create LMDB env for filters: ") + mdbx_strerror(rc)); - } - // max DBs to allow multiple databases (main + schema + numeric_forward + numeric_inverted) - mdbx_env_set_maxdbs(env_, 10); - - // Set geometry for auto-grow using the filter map size settings - rc = mdbx_env_set_geometry( - env_, - -1, // lower size bound (use default) - 1ULL << settings::FILTER_MAP_SIZE_BITS, // current/now size - 1ULL << settings::FILTER_MAP_SIZE_MAX_BITS, // upper size bound - 1ULL << settings::FILTER_MAP_SIZE_BITS, // growth step - 
-1, // shrink threshold (use default) - -1); // pagesize (use default) if(rc != MDBX_SUCCESS) { - throw std::runtime_error(std::string("Failed to set geometry for filters: ") + mdbx_strerror(rc)); + throw std::runtime_error(std::string("Failed to create LMDB env for filters: ") + + mdbx_strerror(rc)); + } + + rc = mdbx_env_set_maxdbs(env_, 10); + if(rc != MDBX_SUCCESS) { + throw std::runtime_error(std::string("Failed to configure max DBs for filters: ") + + mdbx_strerror(rc)); + } + + rc = mdbx_env_set_geometry(env_, + -1, + 1ULL << settings::FILTER_MAP_SIZE_BITS, + 1ULL << settings::FILTER_MAP_SIZE_MAX_BITS, + 1ULL << settings::FILTER_MAP_SIZE_BITS, + -1, + -1); + if(rc != MDBX_SUCCESS) { + throw std::runtime_error(std::string("Failed to set geometry for filters: ") + + mdbx_strerror(rc)); } - rc = mdbx_env_open( - env_, path_.c_str(), MDBX_WRITEMAP | MDBX_MAPASYNC | MDBX_NORDAHEAD, 0664); - if(rc != 0) { - throw std::runtime_error(std::string("Failed to open filter environment: ") + mdbx_strerror(rc)); + rc = mdbx_env_open(env_, + path_.c_str(), + MDBX_WRITEMAP | MDBX_MAPASYNC | MDBX_NORDAHEAD, + 0664); + if(rc != MDBX_SUCCESS) { + throw std::runtime_error(std::string("Failed to open filter environment: ") + + mdbx_strerror(rc)); } - MDBX_txn* txn; + MDBX_txn* txn = nullptr; rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); - if(rc != 0) { - throw std::runtime_error(std::string("Failed to begin filter transaction: ") + mdbx_strerror(rc)); + if(rc != MDBX_SUCCESS) { + throw std::runtime_error(std::string("Failed to begin filter transaction: ") + + mdbx_strerror(rc)); } rc = mdbx_dbi_open(txn, nullptr, MDBX_CREATE, &dbi_); - if(rc != 0) { + if(rc != MDBX_SUCCESS) { mdbx_txn_abort(txn); - throw std::runtime_error(std::string("Failed to open filter database: ") + mdbx_strerror(rc)); + throw std::runtime_error(std::string("Failed to open filter database: ") + + mdbx_strerror(rc)); } + rc = mdbx_txn_commit(txn); - if(rc != 0) { - throw 
std::runtime_error(std::string("Failed to commit filter transaction: ") + mdbx_strerror(rc)); + if(rc != MDBX_SUCCESS) { + throw std::runtime_error(std::string("Failed to commit filter transaction: ") + + mdbx_strerror(rc)); } - // Initialize Indices numeric_index_ = std::make_unique(env_); category_index_ = std::make_unique(env_); - load_schema(); - } - - static std::string format_filter_key(const std::string& field, const std::string& value) { - return field + ":" + value; + auto schema_result = load_schema(); + if(!schema_result.ok()) { + LOG_ERROR(1201, index_id_, schema_result.message); + throw std::runtime_error(schema_result.message); + } } public: @@ -184,20 +284,32 @@ class Filter { init_environment(); } + Filter(const std::string& path) : + Filter(path, "-/-") {} + ~Filter() { mdbx_dbi_close(env_, dbi_); mdbx_env_close(env_); } - // Compute the filter bitmap based on the provided JSON filter array - ndd::RoaringBitmap computeFilterBitmap(const nlohmann::json& filter_array) const { + /* + * Computes the bitmap for an AND filter query. 
+ * + * Return codes: + * 0 = success + * 1 = invalid filter query shape; caller should return HTTP 400 + * 2 = invalid operator or value for field type; caller should return HTTP 400 + * 100-199 = propagated MDBX/storage failure from category or numeric index + * 200-299 = propagated corruption/invariant failure from category or numeric index + */ + ndd::OperationResult + computeFilterBitmap(const nlohmann::json& filter_array) const { if(!filter_array.is_array()) { - throw std::runtime_error("Filter must be an array"); + return {1, "Filter must be an array"}; } if(filter_array.empty()) { - LOG_DEBUG("Empty filter array, returning empty bitmap"); - return ndd::RoaringBitmap(); + return {SUCCESS, "", ndd::RoaringBitmap()}; } std::vector partial_results; @@ -205,17 +317,18 @@ class Filter { for(const auto& condition : filter_array) { if(!condition.is_object() || condition.size() != 1) { - throw std::runtime_error("Each condition must be a single-field object"); + return {1, "Each filter condition must be a single-field object"}; } const auto& field = condition.begin().key(); const auto& expr = condition.begin().value(); - if(field.empty()) { - throw std::runtime_error("Filter field name cannot be empty"); + return {1, "Filter field name cannot be empty"}; + } + if(!expr.is_object() || expr.size() != 1) { + return {1, "Filter operator must be a single-field object"}; } - // Check schema for field type FieldType type = FieldType::Unknown; { std::lock_guard lock(schema_mutex_); @@ -225,371 +338,503 @@ class Filter { } } - ndd::RoaringBitmap or_result; - - if(!expr.is_object() || expr.size() != 1) { - throw std::runtime_error("Operator must be a single-field object"); - } - const std::string op = expr.begin().key(); const auto& val = expr.begin().value(); + ndd::RoaringBitmap or_result; if(op == "$eq") { if(type == FieldType::Number) { - uint32_t sortable_val; - if(val.is_number_integer()) { - sortable_val = ndd::filter::int_to_sortable(val.get()); - } else 
if(val.is_number()) { - sortable_val = ndd::filter::float_to_sortable(val.get()); - } else { - throw std::runtime_error("$eq value for numeric field must be a number"); + auto sortable_result = sortable_from_json(val, "$eq value for numeric field"); + if(!sortable_result.ok()) { + return {sortable_result.code, sortable_result.message}; + } + auto range_result = + numeric_index_->range(field, sortable_result.value_or_throw(), sortable_result.value_or_throw()); + if(!range_result.ok()) { + return {range_result.code, range_result.message}; } - or_result = numeric_index_->range(field, sortable_val, sortable_val); + or_result = std::move(range_result.value_or_throw()); } else { - if(!val.is_string() && !val.is_number_integer() && !val.is_boolean()) { - throw std::runtime_error("$eq value must be string, integer or boolean"); + auto value_result = category_value_from_json(val, "$eq value"); + if(!value_result.ok()) { + return {value_result.code, value_result.message}; } - std::string str_val; - if(val.is_string()) { - str_val = val.get(); - } else if(val.is_boolean()) { - str_val = val.get() ? 
"1" : "0"; - } else { - str_val = std::to_string(val.get()); - if (str_val.size() > 255) throw std::runtime_error("Category value too long"); + auto bitmap_result = category_index_->get_bitmap_by_key( + format_filter_key(field, value_result.value_or_throw())); + if(!bitmap_result.ok()) { + return {bitmap_result.code, bitmap_result.message}; } - std::string key = format_filter_key(field, str_val); - or_result = category_index_->get_bitmap_by_key(key); + or_result = std::move(bitmap_result.value_or_throw()); } } else if(op == "$in") { if(!val.is_array()) { - throw std::runtime_error("$in must be array"); + return {2, "$in must be an array"}; } - if(val.empty()) { - LOG_DEBUG("Empty $in array for field: " << field); - } else { - for(const auto& v : val) { - if(type == FieldType::Number) { - uint32_t sortable_val; - if(v.is_number_integer()) { - sortable_val = ndd::filter::int_to_sortable(v.get()); - } else if(v.is_number()) { - sortable_val = ndd::filter::float_to_sortable(v.get()); - } else { - throw std::runtime_error( - "$in value for numeric field must be a number"); - } - or_result |= numeric_index_->range(field, sortable_val, sortable_val); - } else { - if(!v.is_string() && !v.is_number_integer() && !v.is_boolean()) { - throw std::runtime_error( - "$in values must be string, integer or boolean"); - } - std::string str_val; - if(v.is_string()) { - str_val = v.get(); - } else if(v.is_boolean()) { - str_val = v.get() ? 
"1" : "0"; - } else { - str_val = std::to_string(v.get()); - } - if(!str_val.empty()) { - if (str_val.size() > 255) throw std::runtime_error("Category value too long"); - std::string key = format_filter_key(field, str_val); - or_result |= category_index_->get_bitmap_by_key(key); + + for(const auto& item : val) { + if(type == FieldType::Number) { + auto sortable_result = + sortable_from_json(item, "$in value for numeric field"); + if(!sortable_result.ok()) { + return {sortable_result.code, sortable_result.message}; + } + auto range_result = numeric_index_->range(field, + sortable_result.value_or_throw(), + sortable_result.value_or_throw()); + if(!range_result.ok()) { + return {range_result.code, range_result.message}; + } + or_result |= range_result.value_or_throw(); + } else { + auto value_result = category_value_from_json(item, "$in value"); + if(!value_result.ok()) { + return {value_result.code, value_result.message}; + } + if(!value_result.value_or_throw().empty()) { + auto bitmap_result = category_index_->get_bitmap_by_key( + format_filter_key(field, value_result.value_or_throw())); + if(!bitmap_result.ok()) { + return {bitmap_result.code, bitmap_result.message}; } + or_result |= bitmap_result.value_or_throw(); } } } } else if(op == "$range") { if(!val.is_array() || val.size() != 2) { - throw std::runtime_error( - "$range must be [start, end] array with exactly 2 elements"); + return {2, "$range must be [start, end] with exactly 2 values"}; + } + if(type != FieldType::Number) { + return {2, "$range operator is only supported for numeric fields"}; } - if(type == FieldType::Number) { - uint32_t start_val, end_val; - - if(val[0].is_number_integer()) { - start_val = ndd::filter::int_to_sortable(val[0].get()); - } else if(val[0].is_number()) { - start_val = ndd::filter::float_to_sortable(val[0].get()); - } else { - throw std::runtime_error("Range start must be a number"); - } - - if(val[1].is_number_integer()) { - end_val = 
ndd::filter::int_to_sortable(val[1].get()); - } else if(val[1].is_number()) { - end_val = ndd::filter::float_to_sortable(val[1].get()); - } else { - throw std::runtime_error("Range end must be a number"); - } - - if(start_val > end_val) { - throw std::runtime_error("Invalid range: start > end"); - } + auto start_result = sortable_from_json(val[0], "Range start"); + if(!start_result.ok()) { + return {start_result.code, start_result.message}; + } + auto end_result = sortable_from_json(val[1], "Range end"); + if(!end_result.ok()) { + return {end_result.code, end_result.message}; + } + if(start_result.value_or_throw() > end_result.value_or_throw()) { + return {2, "Invalid range: start > end"}; + } - or_result = numeric_index_->range(field, start_val, end_val); - } else { - throw std::runtime_error( - "$range operator is only supported for numeric fields"); + auto range_result = + numeric_index_->range(field, start_result.value_or_throw(), end_result.value_or_throw()); + if(!range_result.ok()) { + return {range_result.code, range_result.message}; } + or_result = std::move(range_result.value_or_throw()); } else { - throw std::runtime_error("Unsupported operator: " + op); + return {2, "Unsupported filter operator: " + op}; } - + partial_results.push_back(std::move(or_result)); } - // Optimization: Sort by cardinality (smallest first) - std::sort(partial_results.begin(), partial_results.end(), - [](const ndd::RoaringBitmap& a, const ndd::RoaringBitmap& b) { - return a.cardinality() < b.cardinality(); - }); + std::sort(partial_results.begin(), + partial_results.end(), + [](const ndd::RoaringBitmap& left, const ndd::RoaringBitmap& right) { + return left.cardinality() < right.cardinality(); + }); - if (partial_results.empty()) return ndd::RoaringBitmap(); + if(partial_results.empty()) { + return {SUCCESS, "", ndd::RoaringBitmap()}; + } ndd::RoaringBitmap final_result = partial_results[0]; for(size_t i = 1; i < partial_results.size(); ++i) { final_result &= partial_results[i]; 
- // If result becomes empty, stop early - if(final_result.isEmpty()) return final_result; + if(final_result.isEmpty()) { + return {SUCCESS, "", std::move(final_result)}; + } } - return final_result; + return {SUCCESS, "", std::move(final_result)}; } - // Get IDs matching the filter using the provided JSON filter array - std::vector getIdsMatchingFilter(const nlohmann::json& filter_array) const { - auto result = computeFilterBitmap(filter_array); + /* + * Returns numeric ids matching a filter query. + * + * Return codes: + * 0 = success + * 1-99 = propagated filter validation failure from bitmap computation + * 100-199 = propagated MDBX/storage failure from bitmap computation + * 200-299 = propagated corruption/invariant failure from bitmap computation + */ + ndd::OperationResult> + getIdsMatchingFilter(const nlohmann::json& filter_array) const { + auto bitmap_result = computeFilterBitmap(filter_array); + if(!bitmap_result.ok()) { + return {bitmap_result.code, bitmap_result.message}; + } + std::vector ids; - ids.reserve(result.cardinality()); - result.iterate( + ids.reserve(bitmap_result.value_or_throw().cardinality()); + bitmap_result.value_or_throw().iterate( [](ndd::idInt val, void* ptr) { static_cast*>(ptr)->push_back(val); return true; }, &ids); - return ids; + return {SUCCESS, "", std::move(ids)}; } - // Count the number of IDs matching the filter using the provided JSON filter array - size_t countIdsMatchingFilter(const nlohmann::json& filter_array) const { - return computeFilterBitmap(filter_array).cardinality(); + /* + * Counts numeric ids matching a filter query. 
+ * + * Return codes: + * 0 = success + * 1-99 = propagated filter validation failure from bitmap computation + * 100-199 = propagated MDBX/storage failure from bitmap computation + * 200-299 = propagated corruption/invariant failure from bitmap computation + */ + ndd::OperationResult countIdsMatchingFilter(const nlohmann::json& filter_array) const { + auto bitmap_result = computeFilterBitmap(filter_array); + if(!bitmap_result.ok()) { + return {bitmap_result.code, bitmap_result.message}; + } + return {SUCCESS, "", bitmap_result.value_or_throw().cardinality()}; } - void add_to_filter(const std::string& field, const std::string& value, ndd::idInt numeric_id) { - category_index_->add(field, value, numeric_id); + /* + * Adds one id to a category filter. + * + * Return codes: + * 0 = success + * 100-199 = propagated MDBX/storage failure from category index + * 200-299 = propagated corruption/invariant failure from category index + */ + ndd::OperationResult<> + add_to_filter(const std::string& field, const std::string& value, ndd::idInt numeric_id) { + return category_index_->add(field, value, numeric_id); } - // Batch add operation for filters - void add_to_filter_batch(const std::string& filter_key, - const std::vector& numeric_ids) { + /* + * Adds a batch of ids to one already formatted category filter key. + * + * Return codes: + * 0 = success + * 100-199 = propagated MDBX/storage failure from category index + * 200-299 = propagated corruption/invariant failure from category index + */ + ndd::OperationResult<> add_to_filter_batch(const std::string& filter_key, + const std::vector& numeric_ids) { if(numeric_ids.empty()) { - return; + return {SUCCESS, ""}; } - category_index_->add_batch_by_key(filter_key, numeric_ids); + return category_index_->add_batch_by_key(filter_key, numeric_ids); } - // Optimized version to process filter JSON in batch - void add_filters_from_json_batch( + /* + * Adds one batch of filter JSON documents into the numeric and category indexes. 
+ * + * Return codes: + * 0 = success + * 1 = invalid filter JSON or field shape; caller should return HTTP 400 + * 2 = unsupported filter field type or category value too long; caller should return HTTP 400 + * 3 = field type mismatch with existing schema; caller should return HTTP 400 + * 100-199 = propagated MDBX/storage failure from schema, numeric, or category writes + * 200-299 = propagated corruption/invariant failure from numeric or category writes + */ + ndd::OperationResult<> add_filters_from_json_batch( const std::vector>& id_filter_pairs) { if(id_filter_pairs.empty()) { - return; + return {SUCCESS, ""}; } - // Create a map to collect IDs for each label filter std::unordered_map> label_filter_to_ids; label_filter_to_ids.reserve(id_filter_pairs.size()); std::vector numeric_filter_entries; numeric_filter_entries.reserve(id_filter_pairs.size()); - // Group IDs by filter for(const auto& [numeric_id, filter_json] : id_filter_pairs) { + nlohmann::json parsed; try { - auto j = nlohmann::json::parse(filter_json); - for(const auto& [field, value] : j.items()) { - FieldType type = FieldType::Unknown; - if(value.is_boolean()) { - type = FieldType::Bool; - } else if(value.is_number()) { - type = FieldType::Number; // Unified check - } else if(value.is_string()) { - type = FieldType::String; - } + parsed = nlohmann::json::parse(filter_json); + } catch(const std::exception& e) { + return {1, "Invalid filter JSON: " + std::string(e.what())}; + } - if(type == FieldType::Unknown) { - /*This should ideally be an error or atleast an info log.*/ - LOG_INFO("Unsupported filter type for field '" << field << "'"); - continue; - } + if(!parsed.is_object()) { + return {1, "Filter JSON document must be an object"}; + } - if(!register_field_type(field, type)) { - LOG_ERROR(1202, index_id_, "Type mismatch for field '" << field << "'"); - continue; - } + for(const auto& [field, value] : parsed.items()) { + if(field.empty()) { + return {1, "Filter field name cannot be empty"}; + } - 
if(value.is_string()) { - std::string filter_key = format_filter_key(field, value.get()); - label_filter_to_ids[filter_key].emplace_back(numeric_id); - } else if(value.is_number()) { - uint32_t sortable_val; - if(value.is_number_integer()) { - sortable_val = ndd::filter::int_to_sortable(value.get()); - } else { - sortable_val = ndd::filter::float_to_sortable(value.get()); - } - numeric_filter_entries.emplace_back(field, numeric_id, sortable_val); - } else if(value.is_boolean()) { - std::string filter_key = - format_filter_key(field, value.get() ? "1" : "0"); - label_filter_to_ids[filter_key].emplace_back(numeric_id); - } else { - LOG_WARN(1203, - index_id_, - "Unsupported filter type for field '" << field - << "' in filter: " - << value.dump()); + FieldType type = FieldType::Unknown; + if(value.is_boolean()) { + type = FieldType::Bool; + } else if(value.is_number()) { + type = FieldType::Number; + } else if(value.is_string()) { + type = FieldType::String; + } + + if(type == FieldType::Unknown) { + return {2, "Unsupported filter type for field '" + field + "'"}; + } + + auto register_result = register_field_type(field, type); + if(!register_result.ok()) { + return register_result; + } + + if(type == FieldType::String) { + auto category_result = category_value_from_json(value, "Filter value"); + if(!category_result.ok()) { + return {category_result.code, + category_result.message + " for field '" + field + "'"}; + } + label_filter_to_ids[format_filter_key(field, category_result.value_or_throw())] + .emplace_back(numeric_id); + } else if(type == FieldType::Bool) { + label_filter_to_ids[format_filter_key(field, value.get() ? 
"1" : "0")] + .emplace_back(numeric_id); + } else if(type == FieldType::Number) { + auto sortable_result = sortable_from_json(value, "Numeric filter value"); + if(!sortable_result.ok()) { + return {sortable_result.code, + sortable_result.message + " for field '" + field + "'"}; } + numeric_filter_entries.emplace_back(field, numeric_id, sortable_result.value_or_throw()); } - } catch(const std::exception& e) { - LOG_ERROR(1204, index_id_, "Error parsing filter JSON: " << e.what()); } } - /** - * XXX: For transactional correctness of filter adds, all the filters - * should be added in a single transaction. - * For now, they are being added in two different transactions. - * one for numeric_index and other for labels. - */ - if(!numeric_filter_entries.empty()) { - numeric_index_->put_batch(numeric_filter_entries); + auto numeric_result = numeric_index_->put_batch(numeric_filter_entries); + if(!numeric_result.ok()) { + return numeric_result; + } } - // Process each filter with its batch of IDs for(const auto& [filter_key, ids] : label_filter_to_ids) { - add_to_filter_batch(filter_key, ids); + auto add_result = add_to_filter_batch(filter_key, ids); + if(!add_result.ok()) { + return add_result; + } } + + return {SUCCESS, ""}; } - void - remove_from_filter(const std::string& field, const std::string& value, ndd::idInt numeric_id) { - category_index_->remove(field, value, numeric_id); + /* + * Removes one id from a category filter. + * + * Return codes: + * 0 = success + * 100-199 = propagated MDBX/storage failure from category index + * 200-299 = propagated corruption/invariant failure from category index + */ + ndd::OperationResult<> + remove_from_filter(const std::string& field, + const std::string& value, + ndd::idInt numeric_id) { + return category_index_->remove(field, value, numeric_id); } - bool contains(const std::string& field, const std::string& value, ndd::idInt numeric_id) const { + /* + * Checks whether one id is present in a category filter. 
+ * + * Return codes: + * 0 = success + * 100-199 = propagated MDBX/storage failure from category index + * 200-299 = propagated corruption/invariant failure from category index + */ + ndd::OperationResult + contains(const std::string& field, const std::string& value, ndd::idInt numeric_id) const { return category_index_->contains(field, value, numeric_id); } - void add_filters_from_json(ndd::idInt numeric_id, const std::string& filter_json) { - add_filters_from_json_batch({{numeric_id, filter_json}}); + /* + * Adds one filter JSON document into the numeric and category indexes. + * + * Return codes: + * 0 = success + * 1-99 = propagated filter validation failure from batch add + * 100-199 = propagated MDBX/storage failure from batch add + * 200-299 = propagated corruption/invariant failure from batch add + */ + ndd::OperationResult<> add_filters_from_json(ndd::idInt numeric_id, + const std::string& filter_json) { + return add_filters_from_json_batch({{numeric_id, filter_json}}); } - void remove_filters_from_json(ndd::idInt numeric_id, const std::string& filter_json) { + /* + * Removes one filter JSON document from the numeric and category indexes. 
+ * + * Return codes: + * 0 = success + * 1 = invalid filter JSON or field shape; caller should return HTTP 400 + * 2 = unsupported filter field type; caller should return HTTP 400 + * 100-199 = propagated MDBX/storage failure from numeric or category index + * 200-299 = propagated corruption/invariant failure from numeric or category index + */ + ndd::OperationResult<> remove_filters_from_json(ndd::idInt numeric_id, + const std::string& filter_json) { + nlohmann::json parsed; try { - auto j = nlohmann::json::parse(filter_json); - for(const auto& [field, value] : j.items()) { - if(value.is_string()) { - remove_from_filter(field, value.get(), numeric_id); - } else if(value.is_number()) { - // Remove from Numeric Index - numeric_index_->remove(field, numeric_id); - } else if(value.is_boolean()) { - remove_from_filter(field, value.get() ? "1" : "0", numeric_id); + parsed = nlohmann::json::parse(filter_json); + } catch(const std::exception& e) { + return {1, "Invalid filter JSON while removing filters: " + std::string(e.what())}; + } + + if(!parsed.is_object()) { + return {1, "Filter JSON document must be an object"}; + } + + for(const auto& [field, value] : parsed.items()) { + if(field.empty()) { + return {1, "Filter field name cannot be empty"}; + } + + ndd::OperationResult<> remove_result{SUCCESS, ""}; + if(value.is_string()) { + auto category_result = category_value_from_json(value, "Filter value"); + if(!category_result.ok()) { + return {category_result.code, + category_result.message + " for field '" + field + "'"}; } + remove_result = remove_from_filter(field, category_result.value_or_throw(), numeric_id); + } else if(value.is_number()) { + remove_result = numeric_index_->remove(field, numeric_id); + } else if(value.is_boolean()) { + remove_result = remove_from_filter(field, + value.get() ? 
"1" : "0", + numeric_id); + } else { + return {2, "Unsupported filter type for field '" + field + "'"}; + } + + if(!remove_result.ok()) { + return remove_result; } - } catch(const std::exception& e) { - LOG_ERROR(1207, index_id_, "Error removing filters: " << e.what()); } + + return {SUCCESS, ""}; } - // Combine multiple filters using AND operation - ndd::RoaringBitmap - combine_filters_and(const std::vector>& filters) const { + /* + * Combines category filters with AND semantics. + * + * Return codes: + * 0 = success + * 100-199 = propagated MDBX/storage failure from category index + * 200-299 = propagated corruption/invariant failure from category index + */ + ndd::OperationResult combine_filters_and( + const std::vector>& filters) const { ndd::RoaringBitmap result; bool first = true; for(const auto& [field, value] : filters) { + auto bitmap_result = category_index_->get_bitmap(field, value); + if(!bitmap_result.ok()) { + return {bitmap_result.code, bitmap_result.message}; + } if(first) { - result = category_index_->get_bitmap(field, value); + result = std::move(bitmap_result.value_or_throw()); first = false; } else { - result &= category_index_->get_bitmap(field, value); + result &= bitmap_result.value_or_throw(); } } - return result; + return {SUCCESS, "", std::move(result)}; } - // Combine multiple filters using OR operation - ndd::RoaringBitmap - combine_filters_or(const std::vector>& filters) const { + /* + * Combines category filters with OR semantics. 
+ * + * Return codes: + * 0 = success + * 100-199 = propagated MDBX/storage failure from category index + * 200-299 = propagated corruption/invariant failure from category index + */ + ndd::OperationResult combine_filters_or( + const std::vector>& filters) const { ndd::RoaringBitmap result; for(const auto& [field, value] : filters) { - result |= category_index_->get_bitmap(field, value); + auto bitmap_result = category_index_->get_bitmap(field, value); + if(!bitmap_result.ok()) { + return {bitmap_result.code, bitmap_result.message}; + } + result |= bitmap_result.value_or_throw(); } - return result; + return {SUCCESS, "", std::move(result)}; } - // Check if ID satisfies a numeric condition using Forward Index - bool check_numeric(const std::string& field, - ndd::idInt id, - const std::string& op, - const nlohmann::json& val) const { + /* + * Checks whether one id satisfies one numeric filter expression. + * + * Return codes: + * 0 = success + * 2 = invalid numeric operator or value; caller should return HTTP 400 + * 100-199 = propagated MDBX/storage failure from numeric index + * 200-299 = propagated corruption/invariant failure from numeric index + */ + ndd::OperationResult check_numeric(const std::string& field, + ndd::idInt id, + const std::string& op, + const nlohmann::json& val) const { if(op == "$eq") { - uint32_t sortable_val; - if(val.is_number_integer()) { - sortable_val = ndd::filter::int_to_sortable(val.get()); - } else if(val.is_number()) { - sortable_val = ndd::filter::float_to_sortable(val.get()); - } else { - return false; + auto sortable_result = sortable_from_json(val, "$eq value for numeric field"); + if(!sortable_result.ok()) { + return {sortable_result.code, sortable_result.message}; } - return numeric_index_->check_range(field, id, sortable_val, sortable_val); - } else if(op == "$in") { + return numeric_index_->check_range(field, + id, + sortable_result.value_or_throw(), + sortable_result.value_or_throw()); + } + + if(op == "$in") { 
if(!val.is_array()) { - return false; + return {2, "$in must be an array"}; } - for(const auto& v : val) { - uint32_t sortable_val; - if(v.is_number_integer()) { - sortable_val = ndd::filter::int_to_sortable(v.get()); - } else if(v.is_number()) { - sortable_val = ndd::filter::float_to_sortable(v.get()); - } else { - continue; + for(const auto& item : val) { + auto sortable_result = sortable_from_json(item, "$in value for numeric field"); + if(!sortable_result.ok()) { + return {sortable_result.code, sortable_result.message}; } - if(numeric_index_->check_range(field, id, sortable_val, sortable_val)) { - return true; + auto check_result = numeric_index_->check_range(field, + id, + sortable_result.value_or_throw(), + sortable_result.value_or_throw()); + if(!check_result.ok()) { + return check_result; + } + if(check_result.value_or_throw()) { + return {SUCCESS, "", true}; } } - return false; - } else if(op == "$range") { + return {SUCCESS, "", false}; + } + + if(op == "$range") { if(!val.is_array() || val.size() != 2) { - return false; + return {2, "$range must be [start, end] with exactly 2 values"}; } - uint32_t start_val, end_val; - if(val[0].is_number_integer()) { - start_val = ndd::filter::int_to_sortable(val[0].get()); - } else if(val[0].is_number()) { - start_val = ndd::filter::float_to_sortable(val[0].get()); - } else { - return false; + auto start_result = sortable_from_json(val[0], "Range start"); + if(!start_result.ok()) { + return {start_result.code, start_result.message}; } - - if(val[1].is_number_integer()) { - end_val = ndd::filter::int_to_sortable(val[1].get()); - } else if(val[1].is_number()) { - end_val = ndd::filter::float_to_sortable(val[1].get()); - } else { - return false; + auto end_result = sortable_from_json(val[1], "Range end"); + if(!end_result.ok()) { + return {end_result.code, end_result.message}; + } + if(start_result.value_or_throw() > end_result.value_or_throw()) { + return {2, "Invalid range: start > end"}; } - return 
numeric_index_->check_range(field, id, start_val, end_val); + return numeric_index_->check_range(field, id, start_result.value_or_throw(), end_result.value_or_throw()); } - return false; + + return {2, "Unsupported numeric operator: " + op}; } }; diff --git a/src/filter/numeric_index.hpp b/src/filter/numeric_index.hpp index 960545c0ec..c2206722d0 100644 --- a/src/filter/numeric_index.hpp +++ b/src/filter/numeric_index.hpp @@ -7,9 +7,11 @@ #include #include #include +#include #include "mdbx/mdbx.h" #include "../utils/log.hpp" #include "../core/types.hpp" +#include "../utils/types.hpp" namespace ndd { namespace filter { @@ -233,7 +235,6 @@ namespace ndd { return field + ":" + std::to_string(id); } - // Key Format: [Field]:[BigEndian_BaseValue] std::string make_bucket_key(const std::string& field, uint32_t start_val) { uint32_t be_val = 0; #if defined(__GNUC__) || defined(__clang__) @@ -243,12 +244,14 @@ namespace ndd { | ((start_val >> 8) & 0xff00) | ((start_val << 24) & 0xff000000); #endif std::string key = field + ":"; - key.append((char*)&be_val, 4); + key.append(reinterpret_cast(&be_val), 4); return key; } uint32_t parse_bucket_key_val(const std::string& key) { - if (key.size() < 4) return 0; + if(key.size() < 4) { + return 0; + } uint32_t be_val; std::memcpy(&be_val, key.data() + key.size() - 4, 4); #if defined(__GNUC__) || defined(__clang__) @@ -259,424 +262,627 @@ namespace ndd { #endif } - public: - NumericIndex(MDBX_env* env) : env_(env) { - MDBX_txn* txn; - if (mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn) == MDBX_SUCCESS) { - mdbx_dbi_open(txn, "numeric_forward", MDBX_CREATE, &forward_dbi_); - mdbx_dbi_open(txn, "numeric_inverted", MDBX_CREATE, &inverted_dbi_); - mdbx_txn_commit(txn); + /* + * Removes one id from the numeric inverted bucket that currently owns its old value. 
+ * + * Return codes: + * 0 = success + * 100 = MDBX cursor, read, delete, or write failure; caller should log ERROR and return HTTP 500 + * 200 = corrupt numeric bucket payload; caller should log ERROR and return HTTP 500 + */ + ndd::OperationResult<> remove_from_buckets(MDBX_txn* txn, + const std::string& field, + uint32_t value, + ndd::idInt id) { + std::string bkey_str = make_bucket_key(field, value); + MDBX_val key{const_cast(bkey_str.data()), bkey_str.size()}; + MDBX_val data; + MDBX_cursor* cursor = nullptr; + int rc = mdbx_cursor_open(txn, inverted_dbi_, &cursor); + if(rc != MDBX_SUCCESS) { + return {100, "Failed to open numeric bucket remove cursor: " + + std::string(mdbx_strerror(rc))}; } - } - /** - * TODO: - * 1. comprehensive error print and return. - * If there is an error here, there should be a way to reverse - * vector add operation. - */ - void put_batch(const std::vector& entries) { - if(entries.empty()) { - return; + rc = mdbx_cursor_get(cursor, &key, &data, MDBX_SET_RANGE); + if(rc == MDBX_SUCCESS) { + std::string found_key(static_cast(key.iov_base), key.iov_len); + if(found_key.rfind(field + ":", 0) != 0 + || parse_bucket_key_val(found_key) > value) { + rc = mdbx_cursor_get(cursor, &key, &data, MDBX_PREV); + } + } else if(rc == MDBX_NOTFOUND) { + rc = mdbx_cursor_get(cursor, &key, &data, MDBX_LAST); + } + + if(rc != MDBX_SUCCESS) { + mdbx_cursor_close(cursor); + if(rc == MDBX_NOTFOUND) { + return {SUCCESS, ""}; + } + return {100, "Failed to locate numeric bucket for remove: " + + std::string(mdbx_strerror(rc))}; + } + + std::string found_key(static_cast(key.iov_base), key.iov_len); + if(found_key.rfind(field + ":", 0) != 0) { + mdbx_cursor_close(cursor); + return {SUCCESS, ""}; + } + + uint32_t bucket_base = parse_bucket_key_val(found_key); + if(value < bucket_base) { + mdbx_cursor_close(cursor); + return {SUCCESS, ""}; } - MDBX_txn* txn; - mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); try { - for(const auto& entry : entries) { - 
put_internal(txn, entry.field, entry.id, entry.value); + Bucket bucket = Bucket::deserialize(data.iov_base, data.iov_len, bucket_base); + if(bucket.remove(id)) { + if(bucket.is_empty()) { + rc = mdbx_cursor_del(cursor, static_cast(0)); + if(rc != MDBX_SUCCESS) { + mdbx_cursor_close(cursor); + return {100, "Failed to delete empty numeric bucket: " + + std::string(mdbx_strerror(rc))}; + } + } else { + auto bytes = bucket.serialize(); + MDBX_val new_data{bytes.data(), bytes.size()}; + rc = mdbx_cursor_put(cursor, &key, &new_data, MDBX_CURRENT); + if(rc != MDBX_SUCCESS) { + mdbx_cursor_close(cursor); + return {100, "Failed to update numeric bucket after remove: " + + std::string(mdbx_strerror(rc))}; + } + } } - mdbx_txn_commit(txn); - } catch(...) { - mdbx_txn_abort(txn); - throw; + } catch(const std::exception& e) { + mdbx_cursor_close(cursor); + return {200, "Corrupt numeric bucket while removing id: " + + std::string(e.what())}; } + + mdbx_cursor_close(cursor); + return {SUCCESS, ""}; } - void remove(const std::string& field, ndd::idInt id) { - MDBX_txn* txn; - mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); - try { - std::string fwd_key_str = make_forward_key(field, id); - MDBX_val fwd_key{const_cast(fwd_key_str.data()), fwd_key_str.size()}; - MDBX_val fwd_val; - - if(mdbx_get(txn, forward_dbi_, &fwd_key, &fwd_val) == MDBX_SUCCESS) { - uint32_t old_val; - std::memcpy(&old_val, fwd_val.iov_base, sizeof(uint32_t)); - remove_from_buckets(txn, field, old_val, id); - mdbx_del(txn, forward_dbi_, &fwd_key, nullptr); + /* + * Adds one id/value pair into the numeric inverted bucket index. 
+ * + * Return codes: + * 0 = success + * 100 = MDBX cursor, read, or write failure; caller should log ERROR and return HTTP 500 + * 200 = corrupt numeric bucket payload or invalid bucket invariant; caller should log ERROR and return HTTP 500 + */ + ndd::OperationResult<> add_to_buckets(MDBX_txn* txn, + const std::string& field, + uint32_t value, + ndd::idInt id) { + MDBX_cursor* cursor = nullptr; + int rc = mdbx_cursor_open(txn, inverted_dbi_, &cursor); + if(rc != MDBX_SUCCESS) { + return {100, "Failed to open numeric bucket add cursor: " + + std::string(mdbx_strerror(rc))}; + } + + std::string search_key = make_bucket_key(field, value); + MDBX_val key{const_cast(search_key.data()), search_key.size()}; + MDBX_val data; + + rc = mdbx_cursor_get(cursor, &key, &data, MDBX_SET_RANGE); + if(rc == MDBX_SUCCESS) { + std::string found_key(static_cast(key.iov_base), key.iov_len); + if(found_key.rfind(field + ":", 0) != 0 + || parse_bucket_key_val(found_key) > value) { + int prev_rc = mdbx_cursor_get(cursor, &key, &data, MDBX_PREV); + if(prev_rc == MDBX_SUCCESS) { + rc = prev_rc; + } else if(prev_rc != MDBX_NOTFOUND) { + mdbx_cursor_close(cursor); + return {100, "Failed to seek previous numeric bucket: " + + std::string(mdbx_strerror(prev_rc))}; + } else { + rc = MDBX_NOTFOUND; + } } + } else if(rc == MDBX_NOTFOUND) { + rc = mdbx_cursor_get(cursor, &key, &data, MDBX_LAST); + if(rc != MDBX_SUCCESS && rc != MDBX_NOTFOUND) { + mdbx_cursor_close(cursor); + return {100, "Failed to seek last numeric bucket: " + + std::string(mdbx_strerror(rc))}; + } + } else { + mdbx_cursor_close(cursor); + return {100, "Failed to seek numeric bucket: " + + std::string(mdbx_strerror(rc))}; + } - mdbx_txn_commit(txn); - } catch(...) 
{ - mdbx_txn_abort(txn); - throw; + bool create_new = true; + std::string target_key_str; + uint32_t target_base = 0; + if(rc == MDBX_SUCCESS) { + std::string found_key(static_cast(key.iov_base), key.iov_len); + if(found_key.rfind(field + ":", 0) == 0) { + target_base = parse_bucket_key_val(found_key); + if(value >= target_base + && (static_cast(value) - target_base) + <= Bucket::MAX_DELTA) { + target_key_str = found_key; + create_new = false; + } + } } + + try { + if(create_new) { + Bucket bucket; + bucket.base_value = value; + bucket.add(value, id); + auto bytes = bucket.serialize(); + + target_key_str = make_bucket_key(field, value); + MDBX_val k{const_cast(target_key_str.data()), + target_key_str.size()}; + MDBX_val v{bytes.data(), bytes.size()}; + rc = mdbx_put(txn, inverted_dbi_, &k, &v, MDBX_UPSERT); + if(rc != MDBX_SUCCESS) { + mdbx_cursor_close(cursor); + return {100, "Failed to create numeric bucket: " + + std::string(mdbx_strerror(rc))}; + } + } else { + MDBX_val k{const_cast(target_key_str.data()), + target_key_str.size()}; + MDBX_val v; + rc = mdbx_cursor_get(cursor, &k, &v, MDBX_SET); + if(rc != MDBX_SUCCESS) { + mdbx_cursor_close(cursor); + return {200, "Failed to resync numeric bucket cursor: " + + std::string(mdbx_strerror(rc))}; + } + + Bucket bucket = Bucket::deserialize(v.iov_base, v.iov_len, target_base); + if(bucket.ids.size() >= Bucket::MAX_SIZE) { + size_t mid_idx = bucket.ids.size() / 2; + size_t probe_right = mid_idx; + while(probe_right < bucket.deltas.size() && probe_right > 0 + && bucket.deltas[probe_right] + == bucket.deltas[probe_right - 1]) { + probe_right++; + } + + if(probe_right < bucket.deltas.size()) { + mid_idx = probe_right; + } else { + size_t probe_left = mid_idx; + while(probe_left > 0 + && bucket.deltas[probe_left] + == bucket.deltas[probe_left - 1]) { + probe_left--; + } + mid_idx = probe_left > 0 ? 
probe_left : bucket.deltas.size(); + } + + if(mid_idx == bucket.deltas.size()) { + bucket.add(value, id); + auto bytes = bucket.serialize(); + MDBX_val k2{const_cast(target_key_str.data()), + target_key_str.size()}; + MDBX_val v2{bytes.data(), bytes.size()}; + rc = mdbx_cursor_put(cursor, &k2, &v2, MDBX_CURRENT); + mdbx_cursor_close(cursor); + if(rc != MDBX_SUCCESS) { + return {100, "Failed to update overfull numeric bucket: " + + std::string(mdbx_strerror(rc))}; + } + return {SUCCESS, ""}; + } + + Bucket right_bucket; + right_bucket.base_value = bucket.base_value + bucket.deltas[mid_idx]; + for(size_t i = mid_idx; i < bucket.deltas.size(); ++i) { + right_bucket.add(bucket.base_value + bucket.deltas[i], + bucket.ids[i]); + } + + bucket.deltas.resize(mid_idx); + bucket.ids.resize(mid_idx); + bucket.summary_bitmap = ndd::RoaringBitmap(); + for(auto bucket_id : bucket.ids) { + bucket.summary_bitmap.add(bucket_id); + } + + if(value >= right_bucket.base_value) { + right_bucket.add(value, id); + } else { + bucket.add(value, id); + } + + auto left_bytes = bucket.serialize(); + MDBX_val left_v{left_bytes.data(), left_bytes.size()}; + MDBX_val left_k{const_cast(target_key_str.data()), + target_key_str.size()}; + rc = mdbx_cursor_put(cursor, &left_k, &left_v, MDBX_CURRENT); + if(rc != MDBX_SUCCESS) { + mdbx_cursor_close(cursor); + return {100, "Failed to update split numeric bucket: " + + std::string(mdbx_strerror(rc))}; + } + + auto right_bytes = right_bucket.serialize(); + std::string right_k_str = + make_bucket_key(field, right_bucket.base_value); + MDBX_val right_k{const_cast(right_k_str.data()), + right_k_str.size()}; + MDBX_val right_v{right_bytes.data(), right_bytes.size()}; + rc = mdbx_put(txn, inverted_dbi_, &right_k, &right_v, MDBX_UPSERT); + if(rc != MDBX_SUCCESS) { + mdbx_cursor_close(cursor); + return {100, "Failed to write split numeric bucket: " + + std::string(mdbx_strerror(rc))}; + } + } else { + bucket.add(value, id); + auto bytes = bucket.serialize(); + 
MDBX_val new_data{bytes.data(), bytes.size()}; + rc = mdbx_cursor_put(cursor, &k, &new_data, MDBX_CURRENT); + if(rc != MDBX_SUCCESS) { + mdbx_cursor_close(cursor); + return {100, "Failed to update numeric bucket: " + + std::string(mdbx_strerror(rc))}; + } + } + } + } catch(const std::exception& e) { + mdbx_cursor_close(cursor); + return {200, "Corrupt numeric bucket while adding id: " + + std::string(e.what())}; + } + + mdbx_cursor_close(cursor); + return {SUCCESS, ""}; } - private: - void put_internal(MDBX_txn* txn, const std::string& field, ndd::idInt id, uint32_t value) { - // 1. Check Forward Index + /* + * Writes one numeric forward entry and updates the inverted buckets inside a caller transaction. + * + * Return codes: + * 0 = success + * 100 = MDBX read or write failure; caller should log ERROR and return HTTP 500 + * 100-199 = propagated MDBX/storage failure from bucket helpers + * 200 = corrupt numeric forward value; caller should log ERROR and return HTTP 500 + * 200-299 = propagated corruption/invariant failure from bucket helpers + */ + ndd::OperationResult<> put_internal(MDBX_txn* txn, + const std::string& field, + ndd::idInt id, + uint32_t value) { std::string fwd_key_str = make_forward_key(field, id); MDBX_val fwd_key{const_cast(fwd_key_str.data()), fwd_key_str.size()}; MDBX_val fwd_val; - if (mdbx_get(txn, forward_dbi_, &fwd_key, &fwd_val) == MDBX_SUCCESS) { + int rc = mdbx_get(txn, forward_dbi_, &fwd_key, &fwd_val); + if(rc == MDBX_SUCCESS) { + if(fwd_val.iov_len != sizeof(uint32_t)) { + return {200, "Corrupt numeric forward value for field '" + field + "'"}; + } uint32_t old_val; - std::memcpy(&old_val, fwd_val.iov_base, 4); - if (old_val == value) return; - remove_from_buckets(txn, field, old_val, id); + std::memcpy(&old_val, fwd_val.iov_base, sizeof(uint32_t)); + if(old_val == value) { + return {SUCCESS, ""}; + } + auto remove_result = remove_from_buckets(txn, field, old_val, id); + if(!remove_result.ok()) { + return remove_result; + } + } else 
if(rc != MDBX_NOTFOUND) { + return {100, "Failed to read numeric forward value: " + + std::string(mdbx_strerror(rc))}; } - // 2. Update Forward MDBX_val new_val_data{&value, sizeof(uint32_t)}; - mdbx_put(txn, forward_dbi_, &fwd_key, &new_val_data, MDBX_UPSERT); + rc = mdbx_put(txn, forward_dbi_, &fwd_key, &new_val_data, MDBX_UPSERT); + if(rc != MDBX_SUCCESS) { + return {100, "Failed to write numeric forward value: " + + std::string(mdbx_strerror(rc))}; + } - // 3. Add to Inverted Buckets - add_to_buckets(txn, field, value, id); + return add_to_buckets(txn, field, value, id); } - void remove_from_buckets(MDBX_txn* txn, const std::string& field, uint32_t value, ndd::idInt id) { - // Find bucket - std::string bkey_str = make_bucket_key(field, value); - MDBX_val key{const_cast(bkey_str.data()), bkey_str.size()}; - MDBX_val data; - MDBX_cursor* cursor; - mdbx_cursor_open(txn, inverted_dbi_, &cursor); + public: + NumericIndex(MDBX_env* env) : + env_(env) { + MDBX_txn* txn = nullptr; + int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); + if(rc != MDBX_SUCCESS) { + throw std::runtime_error(std::string("Failed to begin NumericIndex init: ") + + mdbx_strerror(rc)); + } - // Scan backward to find bucket covering 'value' - int rc = mdbx_cursor_get(cursor, &key, &data, MDBX_SET_RANGE); - - // Logic to find correct bucket: - std::string found_key; - - if (rc == MDBX_SUCCESS) { - found_key = std::string((char*)key.iov_base, key.iov_len); - // Check if we are in right field & range - if (found_key.rfind(field + ":", 0) != 0 || parse_bucket_key_val(found_key) > value) { - rc = mdbx_cursor_get(cursor, &key, &data, MDBX_PREV); - } - } else if (rc == MDBX_NOTFOUND) { - /** - * The only possible bucket that could still contain - * value is the very last bucket in the database. - * Hence jumping there. 
- */ - rc = mdbx_cursor_get(cursor, &key, &data, MDBX_LAST); - } - - // Should be at correct bucket now - if (rc == MDBX_SUCCESS) { - found_key = std::string((char*)key.iov_base, key.iov_len); - if (found_key.rfind(field + ":", 0) == 0) { - uint32_t bucket_base = parse_bucket_key_val(found_key); - if (value >= bucket_base) { - Bucket b = Bucket::deserialize(data.iov_base, data.iov_len, bucket_base); - if (b.remove(id)) { - // Save back or Delete if empty - if (b.is_empty()) { - mdbx_cursor_del(cursor, static_cast(0)); - } else { - auto bytes = b.serialize(); - MDBX_val new_data{bytes.data(), bytes.size()}; - mdbx_cursor_put(cursor, &key, &new_data, MDBX_CURRENT); - } - } - } - } + rc = mdbx_dbi_open(txn, "numeric_forward", MDBX_CREATE, &forward_dbi_); + if(rc != MDBX_SUCCESS) { + mdbx_txn_abort(txn); + throw std::runtime_error(std::string("Failed to open numeric_forward dbi: ") + + mdbx_strerror(rc)); } - mdbx_cursor_close(cursor); - } - void add_to_buckets(MDBX_txn* txn, const std::string& field, uint32_t value, ndd::idInt id) { - MDBX_cursor* cursor; - mdbx_cursor_open(txn, inverted_dbi_, &cursor); + rc = mdbx_dbi_open(txn, "numeric_inverted", MDBX_CREATE, &inverted_dbi_); + if(rc != MDBX_SUCCESS) { + mdbx_txn_abort(txn); + throw std::runtime_error(std::string("Failed to open numeric_inverted dbi: ") + + mdbx_strerror(rc)); + } - // Find candidate bucket - std::string search_key = make_bucket_key(field, value); - MDBX_val key{const_cast(search_key.data()), search_key.size()}; - MDBX_val data; + rc = mdbx_txn_commit(txn); + if(rc != MDBX_SUCCESS) { + throw std::runtime_error(std::string("Failed to commit NumericIndex init: ") + + mdbx_strerror(rc)); + } + } - int rc = mdbx_cursor_get(cursor, &key, &data, MDBX_SET_RANGE); - - bool create_new = false; - std::string target_key_str; - uint32_t target_base = 0; + /* + * Writes a batch of numeric filter entries in one MDBX write transaction. 
+ * + * Return codes: + * 0 = success + * 100 = MDBX transaction or commit failure; caller should log ERROR and return HTTP 500 + * 100-199 = propagated MDBX/storage failure from per-entry writes + * 200-299 = propagated corruption/invariant failure from per-entry writes + */ + ndd::OperationResult<> put_batch(const std::vector& entries) { + if(entries.empty()) { + return {SUCCESS, ""}; + } - // Move logic to find predecessor - if (rc == MDBX_SUCCESS) { - std::string found_key((char*)key.iov_base, key.iov_len); - if (found_key.rfind(field + ":", 0) != 0 || parse_bucket_key_val(found_key) > value) { - rc = mdbx_cursor_get(cursor, &key, &data, MDBX_PREV); - } - } else { - rc = mdbx_cursor_get(cursor, &key, &data, MDBX_LAST); + MDBX_txn* txn = nullptr; + int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); + if(rc != MDBX_SUCCESS) { + return {100, "Failed to begin numeric batch write transaction: " + + std::string(mdbx_strerror(rc))}; } - if (rc == MDBX_SUCCESS) { - std::string found_key((char*)key.iov_base, key.iov_len); - if (found_key.rfind(field + ":", 0) == 0) { - target_base = parse_bucket_key_val(found_key); - // Check range condition - if (value >= target_base && (static_cast(value) - target_base) <= Bucket::MAX_DELTA) { - target_key_str = found_key; - } else { - create_new = true; - } - } else { - create_new = true; + for(const auto& entry : entries) { + auto put_result = put_internal(txn, entry.field, entry.id, entry.value); + if(!put_result.ok()) { + mdbx_txn_abort(txn); + return put_result; } - } else { - create_new = true; } - if (create_new) { - // Create new bucket at exact value - Bucket b; - b.base_value = value; - b.add(value, id); - auto bytes = b.serialize(); - - target_key_str = make_bucket_key(field, value); - MDBX_val k{const_cast(target_key_str.data()), target_key_str.size()}; - MDBX_val v{bytes.data(), bytes.size()}; - mdbx_put(txn, inverted_dbi_, &k, &v, MDBX_UPSERT); - - } else { - // Update existing - // We must re-fetch 
current key/data because cursor move might have updated key/data - MDBX_val k{const_cast(target_key_str.data()), target_key_str.size()}; - MDBX_val v; - if(mdbx_cursor_get(cursor, &k, &v, MDBX_SET) != MDBX_SUCCESS) { - // Should not happen if logic is correct - throw std::runtime_error("Cursor sync fail"); - } - - Bucket b = Bucket::deserialize(v.iov_base, v.iov_len, target_base); - - // Capacity Check - if (b.ids.size() >= Bucket::MAX_SIZE) { - // SPLIT LOGIC - // Sort is maintained by arrays. - // "Slide Split": Scan right from median - size_t mid_idx = b.ids.size() / 2; - - // Ensure we don't split a group of identical values - size_t probe_right = mid_idx; - while (probe_right < b.deltas.size() && probe_right > 0 && b.deltas[probe_right] == b.deltas[probe_right - 1]) { - probe_right++; - } - - if (probe_right < b.deltas.size()) { - mid_idx = probe_right; - } else { - // Fallback: Try scanning left - size_t probe_left = mid_idx; - while (probe_left > 0 && b.deltas[probe_left] == b.deltas[probe_left - 1]) { - probe_left--; - } - - if (probe_left > 0) { - mid_idx = probe_left; - } else { - // All identical - mid_idx = b.deltas.size(); - } - } - - // If we hit end, we can't split by value uniqueness - if (mid_idx == b.deltas.size()) { - // Fallback: Just append (overfill) or implement logic to handle identicals. - // For now: Append - b.add(value, id); - auto bytes = b.serialize(); - MDBX_val k2{const_cast(target_key_str.data()), target_key_str.size()}; - MDBX_val v2{bytes.data(), bytes.size()}; - mdbx_cursor_put(cursor, &k2, &v2, MDBX_CURRENT); - mdbx_cursor_close(cursor); - return; - } - - // Standard Slide Split - Bucket right_b; - right_b.base_value = b.base_value + b.deltas[mid_idx]; // New base - - // Move entries - for(size_t i=mid_idx; i= right_b.base_value) { - right_b.add(value, id); - } else { - // If value < right, goes to left. - // But wait, split point was determined by existing items. - // If new value is >= base+split_delta, it goes right. 
- // BUT we just cleared right from b. - // Correct logic: - b.add(value, id); // Add to left if it fits range (logic handles delta) - // Oh wait, if we added to left, we might overflow again or break order? - // Simply: Check which bucket covers it. - // Left covers [Base, RightBase-1] - // Right covers [RightBase, ...] - } - - // Save Left - auto left_bytes = b.serialize(); - MDBX_val left_v{left_bytes.data(), left_bytes.size()}; - MDBX_val left_k{const_cast(target_key_str.data()), target_key_str.size()}; - mdbx_cursor_put(cursor, &left_k, &left_v, MDBX_CURRENT); - - // Save Right - auto right_bytes = right_b.serialize(); - std::string right_k_str = make_bucket_key(field, right_b.base_value); - MDBX_val right_k{const_cast(right_k_str.data()), right_k_str.size()}; - MDBX_val right_v{right_bytes.data(), right_bytes.size()}; - - // Use put for new key - mdbx_put(txn, inverted_dbi_, &right_k, &right_v, MDBX_UPSERT); + rc = mdbx_txn_commit(txn); + if(rc != MDBX_SUCCESS) { + return {100, "Failed to commit numeric batch write transaction: " + + std::string(mdbx_strerror(rc))}; + } + return {SUCCESS, ""}; + } - } else { - // Normal Insert - b.add(value, id); - auto bytes = b.serialize(); - MDBX_val new_data{bytes.data(), bytes.size()}; - - // Use cursor put to update current - mdbx_cursor_put(cursor, &k, &new_data, MDBX_CURRENT); - } + /* + * Removes one id from the numeric forward and inverted indexes for a field. 
+ * + * Return codes: + * 0 = success + * 100 = MDBX transaction, read, delete, or commit failure; caller should log ERROR and return HTTP 500 + * 100-199 = propagated MDBX/storage failure from bucket helpers + * 200 = corrupt numeric forward value; caller should log ERROR and return HTTP 500 + * 200-299 = propagated corruption/invariant failure from bucket helpers + */ + ndd::OperationResult<> remove(const std::string& field, ndd::idInt id) { + MDBX_txn* txn = nullptr; + int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); + if(rc != MDBX_SUCCESS) { + return {100, "Failed to begin numeric remove transaction: " + + std::string(mdbx_strerror(rc))}; } - mdbx_cursor_close(cursor); + + std::string fwd_key_str = make_forward_key(field, id); + MDBX_val fwd_key{const_cast(fwd_key_str.data()), fwd_key_str.size()}; + MDBX_val fwd_val; + + rc = mdbx_get(txn, forward_dbi_, &fwd_key, &fwd_val); + if(rc == MDBX_NOTFOUND) { + mdbx_txn_abort(txn); + return {SUCCESS, ""}; + } + if(rc != MDBX_SUCCESS) { + mdbx_txn_abort(txn); + return {100, "Failed to read numeric forward value for remove: " + + std::string(mdbx_strerror(rc))}; + } + if(fwd_val.iov_len != sizeof(uint32_t)) { + mdbx_txn_abort(txn); + return {200, "Corrupt numeric forward value for field '" + field + "'"}; + } + + uint32_t old_val; + std::memcpy(&old_val, fwd_val.iov_base, sizeof(uint32_t)); + auto remove_result = remove_from_buckets(txn, field, old_val, id); + if(!remove_result.ok()) { + mdbx_txn_abort(txn); + return remove_result; + } + + rc = mdbx_del(txn, forward_dbi_, &fwd_key, nullptr); + if(rc != MDBX_SUCCESS && rc != MDBX_NOTFOUND) { + mdbx_txn_abort(txn); + return {100, "Failed to delete numeric forward value: " + + std::string(mdbx_strerror(rc))}; + } + + rc = mdbx_txn_commit(txn); + if(rc != MDBX_SUCCESS) { + return {100, "Failed to commit numeric remove transaction: " + + std::string(mdbx_strerror(rc))}; + } + return {SUCCESS, ""}; } - public: - ndd::RoaringBitmap range(const std::string& 
field, uint32_t min_val, uint32_t max_val) { + /* + * Computes a bitmap of ids whose numeric field value falls within an inclusive sortable range. + * + * Return codes: + * 0 = success + * 100 = MDBX transaction, cursor, or scan failure; caller should log ERROR and return HTTP 500 + * 200 = corrupt numeric bucket payload; caller should log ERROR and return HTTP 500 + */ + ndd::OperationResult + range(const std::string& field, uint32_t min_val, uint32_t max_val) { ndd::RoaringBitmap result; - MDBX_txn* txn; - if (mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn) != MDBX_SUCCESS) return result; + MDBX_txn* txn = nullptr; + int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn); + if(rc != MDBX_SUCCESS) { + return {100, "Failed to begin numeric range transaction: " + + std::string(mdbx_strerror(rc))}; + } - MDBX_cursor* cursor; - mdbx_cursor_open(txn, inverted_dbi_, &cursor); + MDBX_cursor* cursor = nullptr; + rc = mdbx_cursor_open(txn, inverted_dbi_, &cursor); + if(rc != MDBX_SUCCESS) { + mdbx_txn_abort(txn); + return {100, "Failed to open numeric range cursor: " + + std::string(mdbx_strerror(rc))}; + } - // 1. 
Find Start Bucket std::string start_k = make_bucket_key(field, min_val); MDBX_val key{const_cast(start_k.data()), start_k.size()}; MDBX_val data; - int rc = mdbx_cursor_get(cursor, &key, &data, MDBX_SET_RANGE); - if (rc == MDBX_SUCCESS) { - // Check if we need to back up - std::string fkey((char*)key.iov_base, key.iov_len); - if (fkey.rfind(field + ":", 0) != 0 || parse_bucket_key_val(fkey) > min_val) { - // Check prev - MDBX_val p_key = key; - MDBX_val p_data; - if (mdbx_cursor_get(cursor, &p_key, &p_data, MDBX_PREV) == MDBX_SUCCESS) { - std::string pkey_str((char*)p_key.iov_base, p_key.iov_len); - if (pkey_str.rfind(field + ":", 0) == 0) { - // Prev is valid start - key = p_key; data = p_data; - rc = MDBX_SUCCESS; - } - } - } - } else if (rc == MDBX_NOTFOUND) { - rc = mdbx_cursor_get(cursor, &key, &data, MDBX_LAST); - if (rc == MDBX_SUCCESS && data.iov_len > 0) { - std::string fkey((char*)key.iov_base, key.iov_len); - if (fkey.rfind(field + ":", 0) == 0) { - rc = MDBX_SUCCESS; - } else { - rc = MDBX_NOTFOUND; - } - } else { - rc = MDBX_NOTFOUND; - } - } - - // Iterate forward - while (rc == MDBX_SUCCESS) { - std::string cur_key((char*)key.iov_base, key.iov_len); - if (cur_key.rfind(field + ":", 0) != 0) break; // End of field - - uint32_t bucket_base = parse_bucket_key_val(cur_key); - - if (bucket_base > max_val) break; // Past the end - - // Peek Strategy: - // If bucket_base >= min_val, we know the start is covered. - // If we could know NEXT bucket start, we'd know overlap. - // Since we iterate, we can be greedy on read. - - // For now, always deserialize. - // Potential optimization: Read only bitmap if we are "deep" in the range. - // e.g. min_val=10, max_val=100. Bucket=20. - // If bucket=20. Next Bucket=30. - // Then Bucket 20 covers [20..30). - // Range [10..100] covers [20..30] fully. - // So we need lookahead. - - // Simple logic without lookahead: - // Just read full bucket. It's 8KB max (2 pages). - // It's fast unless we have millions of buckets. 
- - Bucket b = Bucket::deserialize(data.iov_base, data.iov_len, bucket_base); - - if (b.ids.empty()) { - rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT); - continue; + rc = mdbx_cursor_get(cursor, &key, &data, MDBX_SET_RANGE); + if(rc == MDBX_SUCCESS) { + std::string fkey(static_cast(key.iov_base), key.iov_len); + if(fkey.rfind(field + ":", 0) != 0 || parse_bucket_key_val(fkey) > min_val) { + MDBX_val prev_key = key; + MDBX_val prev_data; + int prev_rc = mdbx_cursor_get(cursor, &prev_key, &prev_data, MDBX_PREV); + if(prev_rc == MDBX_SUCCESS) { + std::string prev_key_str(static_cast(prev_key.iov_base), + prev_key.iov_len); + if(prev_key_str.rfind(field + ":", 0) == 0) { + key = prev_key; + data = prev_data; + } + } else if(prev_rc != MDBX_NOTFOUND) { + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); + return {100, "Failed to seek previous numeric range bucket: " + + std::string(mdbx_strerror(prev_rc))}; + } } + } else if(rc == MDBX_NOTFOUND) { + rc = mdbx_cursor_get(cursor, &key, &data, MDBX_LAST); + if(rc == MDBX_SUCCESS) { + std::string fkey(static_cast(key.iov_base), key.iov_len); + if(fkey.rfind(field + ":", 0) != 0) { + rc = MDBX_NOTFOUND; + } + } else if(rc != MDBX_NOTFOUND) { + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); + return {100, "Failed to seek last numeric range bucket: " + + std::string(mdbx_strerror(rc))}; + } + } else { + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); + return {100, "Failed to seek numeric range bucket: " + + std::string(mdbx_strerror(rc))}; + } - uint32_t b_min = b.get_value(0); - uint32_t b_max = b.get_value(b.ids.size()-1); + try { + while(rc == MDBX_SUCCESS) { + std::string cur_key(static_cast(key.iov_base), key.iov_len); + if(cur_key.rfind(field + ":", 0) != 0) { + break; + } - if (b_min >= min_val && b_max <= max_val) { - // Full overlap - result |= b.summary_bitmap; - } else { - // Partial overlap - for(size_t i=0; i= min_val && v <= max_val) { - result.add(b.ids[i]); - } - } - } + uint32_t bucket_base = 
parse_bucket_key_val(cur_key); + if(bucket_base > max_val) { + break; + } + + Bucket bucket = Bucket::deserialize(data.iov_base, + data.iov_len, + bucket_base); + if(!bucket.ids.empty()) { + uint32_t bucket_min = bucket.get_value(0); + uint32_t bucket_max = bucket.get_value(bucket.ids.size() - 1); + + if(bucket_min >= min_val && bucket_max <= max_val) { + result |= bucket.summary_bitmap; + } else { + for(size_t i = 0; i < bucket.ids.size(); ++i) { + uint32_t value = bucket.get_value(i); + if(value >= min_val && value <= max_val) { + result.add(bucket.ids[i]); + } + } + } + } - rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT); + rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT); + } + } catch(const std::exception& e) { + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); + return {200, "Corrupt numeric bucket during range scan: " + + std::string(e.what())}; } mdbx_cursor_close(cursor); mdbx_txn_abort(txn); - return result; + if(rc != MDBX_SUCCESS && rc != MDBX_NOTFOUND) { + return {100, "Failed during numeric range scan: " + + std::string(mdbx_strerror(rc))}; + } + return {SUCCESS, "", std::move(result)}; } - bool check_range(const std::string& field, ndd::idInt id, uint32_t min_val, uint32_t max_val) { - MDBX_txn* txn; - if(mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn) != MDBX_SUCCESS) return false; - + /* + * Checks whether one id has a numeric field value inside an inclusive sortable range. 
+ * + * Return codes: + * 0 = success + * 100 = MDBX transaction or read failure; caller should log ERROR and return HTTP 500 + * 200 = corrupt numeric forward value; caller should log ERROR and return HTTP 500 + */ + ndd::OperationResult + check_range(const std::string& field, + ndd::idInt id, + uint32_t min_val, + uint32_t max_val) { + MDBX_txn* txn = nullptr; + int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn); + if(rc != MDBX_SUCCESS) { + return {100, "Failed to begin numeric check transaction: " + + std::string(mdbx_strerror(rc))}; + } + std::string fwd_key_str = make_forward_key(field, id); MDBX_val fwd_key{const_cast(fwd_key_str.data()), fwd_key_str.size()}; MDBX_val fwd_val; - - bool match = false; - if(mdbx_get(txn, forward_dbi_, &fwd_key, &fwd_val) == MDBX_SUCCESS) { - uint32_t val; - std::memcpy(&val, fwd_val.iov_base, 4); - if(val >= min_val && val <= max_val) match = true; + + rc = mdbx_get(txn, forward_dbi_, &fwd_key, &fwd_val); + if(rc == MDBX_NOTFOUND) { + mdbx_txn_abort(txn); + return {SUCCESS, "", false}; } - + if(rc != MDBX_SUCCESS) { + mdbx_txn_abort(txn); + return {100, "Failed to read numeric forward value during check: " + + std::string(mdbx_strerror(rc))}; + } + if(fwd_val.iov_len != sizeof(uint32_t)) { + mdbx_txn_abort(txn); + return {200, "Corrupt numeric forward value for field '" + field + "'"}; + } + + uint32_t value; + std::memcpy(&value, fwd_val.iov_base, sizeof(uint32_t)); mdbx_txn_abort(txn); - return match; + return {SUCCESS, "", value >= min_val && value <= max_val}; } }; diff --git a/src/main.cpp b/src/main.cpp index 28918be54f..1fe4bab119 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -88,6 +88,21 @@ inline crow::response json_error(int code, const std::string& message) { crow::json::wvalue err_json({{"error", message}}); return crow::response(code, err_json.dump()); } + + +/** + * OperationResult code ranges are the contract between core/filter code and the HTTP + * boundary: SUCCESS means success, 1-99 means the 
request was rejected for caller-fixable + * input such as filter validation, and 100+ means storage/internal/corruption failure. + * When adding an OperationResult-returning function, document its code ranges and keep + * client-correctable errors below 100 so this helper maps them to HTTP 400 instead of + * the 500 path. + */ +template +inline bool operation_error_is_client_error(const ndd::OperationResult& result) { + return result.code > SUCCESS && result.code < 100; +} + // Special helper function to log and send error messages in JSON format for 500 errors inline crow::response json_error_500(const std::string& username, const std::string& index_name, @@ -207,7 +222,7 @@ int main(int argc, char** argv) { } if(!run_startup_sanity_checks()) { - LOG_ERROR(1799, "Server startup aborted due to failed sanity checks"); + LOG_ERROR(1076, "Server startup aborted due to failed sanity checks"); return 1; } @@ -855,14 +870,23 @@ int main(int argc, char** argv) { dense_rrf_weight, rrf_rank_constant); - if(!search_response) { - LOG_WARN(1038, ctx.username, index_name, "Search request returned no results because the index is missing or search failed"); - return json_error(404, "Index not found or search failed"); + if(!search_response.ok()) { + if(operation_error_is_client_error(search_response)) { + LOG_WARN(1075, + ctx.username, + index_name, + "Search request rejected: " << search_response.message); + return json_error(400, search_response.message); + } + return json_error_500(ctx.username, + index_name, + req.url, + search_response.message); } // Serialize the ResultSet using MessagePack msgpack::sbuffer sbuf; - msgpack::pack(sbuf, search_response.value()); + msgpack::pack(sbuf, search_response.value_or_throw()); crow::response resp(200, std::string(sbuf.data(), sbuf.size())); resp.add_header("Content-Type", "application/msgpack"); return resp; @@ -956,8 +980,21 @@ int main(int argc, char** argv) { } try { - bool success = index_manager.addVectors(index_id, vectors); - 
if(!success) { + auto insert_result = index_manager.addVectors(index_id, vectors); + if(!insert_result.ok()) { + if(operation_error_is_client_error(insert_result)) { + LOG_WARN(1069, + ctx.username, + index_name, + "Insert request rejected: " << insert_result.message); + return json_error(400, insert_result.message); + } + return json_error_500(ctx.username, + index_name, + req.url, + insert_result.message); + } + if(!insert_result.value_or_throw()) { LOG_WARN(1066, ctx.username, index_name, @@ -981,8 +1018,21 @@ int main(int argc, char** argv) { // Try HybridVectorObject first auto vectors = obj.as>(); LOG_DEBUG("Batch size (Hybrid): " << vectors.size()); - bool success = index_manager.addVectors(index_id, vectors); - if(!success) { + auto insert_result = index_manager.addVectors(index_id, vectors); + if(!insert_result.ok()) { + if(operation_error_is_client_error(insert_result)) { + LOG_WARN(1070, + ctx.username, + index_name, + "Insert request rejected: " << insert_result.message); + return json_error(400, insert_result.message); + } + return json_error_500(ctx.username, + index_name, + req.url, + insert_result.message); + } + if(!insert_result.value_or_throw()) { LOG_WARN(1067, ctx.username, index_name, @@ -994,8 +1044,21 @@ int main(int argc, char** argv) { // Fallback to VectorObject auto vectors = obj.as>(); LOG_DEBUG("Batch size (Dense): " << vectors.size()); - bool success = index_manager.addVectors(index_id, vectors); - if(!success) { + auto insert_result = index_manager.addVectors(index_id, vectors); + if(!insert_result.ok()) { + if(operation_error_is_client_error(insert_result)) { + LOG_WARN(1071, + ctx.username, + index_name, + "Insert request rejected: " << insert_result.message); + return json_error(400, insert_result.message); + } + return json_error_500(ctx.username, + index_name, + req.url, + insert_result.message); + } + if(!insert_result.value_or_throw()) { LOG_WARN(1068, ctx.username, index_name, @@ -1067,7 +1130,21 @@ int main(int argc, char** 
argv) { LOG_DEBUG("Deleting vector " << vector_id << " from index " << index_id); try { - if(index_manager.deleteVector(index_id, vector_id)) { + auto delete_result = index_manager.deleteVector(index_id, vector_id); + if(!delete_result.ok()) { + if(operation_error_is_client_error(delete_result)) { + LOG_WARN(1072, + ctx.username, + index_name, + "Delete-vector request rejected: " << delete_result.message); + return json_error(400, delete_result.message); + } + return json_error_500(ctx.username, + index_name, + req.url, + delete_result.message); + } + if(delete_result.value_or_throw()) { return crow::response(200, "Vector deleted successfully"); } else { LOG_WARN(1046, ctx.username, index_name, "Delete-vector request for missing vector id " << vector_id); @@ -1113,10 +1190,25 @@ int main(int argc, char** argv) { "Filter must be an array. Please use format: " "[{\"field\":{\"$op\":value}}]"); } - size_t deleted_count = + auto delete_result = index_manager.deleteVectorsByFilter(index_id, filter_array); + if(!delete_result.ok()) { + if(operation_error_is_client_error(delete_result)) { + LOG_WARN(1073, + ctx.username, + index_name, + "Delete-by-filter request rejected: " << delete_result.message); + return json_error(400, delete_result.message); + } + return json_error_500(ctx.username, + index_name, + req.url, + delete_result.message); + } - return crow::response(200, std::to_string(deleted_count) + " vectors deleted"); + return crow::response(200, + std::to_string(delete_result.value_or_throw()) + + " vectors deleted"); } catch(const std::runtime_error& e) { LOG_WARN(1051, ctx.username, index_name, "Delete-by-filter request rejected: " << e.what()); return json_error(400, e.what()); @@ -1163,8 +1255,23 @@ int main(int argc, char** argv) { updates.emplace_back(id, filter); } - size_t count = index_manager.updateFilters(index_id, updates); - return crow::response(200, std::to_string(count) + " filters updated"); + auto update_result = 
index_manager.updateFilters(index_id, updates); + if(!update_result.ok()) { + if(operation_error_is_client_error(update_result)) { + LOG_WARN(1074, + ctx.username, + index_name, + "Update-filters request rejected: " << update_result.message); + return json_error(400, update_result.message); + } + return json_error_500(ctx.username, + index_name, + req.url, + update_result.message); + } + return crow::response(200, + std::to_string(update_result.value_or_throw()) + + " filters updated"); } catch(const std::runtime_error& e) { LOG_WARN(1054, ctx.username, index_name, "Update-filters request rejected: " << e.what()); diff --git a/src/storage/vector_storage.hpp b/src/storage/vector_storage.hpp index e5430be930..3318843c97 100644 --- a/src/storage/vector_storage.hpp +++ b/src/storage/vector_storage.hpp @@ -593,19 +593,30 @@ class VectorStorage { filter_store_ = std::make_unique(base_path + "/filters", index_id_); } VectorStore::Cursor getCursor() { return vector_store_->getCursor(); } - // Get numeric ids of matching filters - std::vector getIdsMatchingFilters( + /* + * Returns numeric ids matching legacy category filter pairs. 
+ * + * Return codes: + * 0 = success + * 100-199 = propagated MDBX/storage failure from filter store + * 200-299 = propagated corruption/invariant failure from filter store + */ + ndd::OperationResult> getIdsMatchingFilters( const std::vector>& filter_pairs) const { - auto bitmap = filter_store_->combine_filters_and(filter_pairs); + auto bitmap_result = filter_store_->combine_filters_and(filter_pairs); + if(!bitmap_result.ok()) { + return {bitmap_result.code, bitmap_result.message}; + } + std::vector numeric_ids; - bitmap.iterate( + bitmap_result.value_or_throw().iterate( [](ndd::idInt value, void* ptr) -> bool { auto* ids = static_cast*>(ptr); ids->push_back(value); return true; }, &numeric_ids); - return numeric_ids; + return {SUCCESS, "", std::move(numeric_ids)}; } bool matches_filter(ndd::idInt numeric_id, @@ -641,7 +652,8 @@ class VectorStorage { } if(is_numeric_query) { - if(!filter_store_->check_numeric(field, numeric_id, op, val)) { + auto check_result = filter_store_->check_numeric(field, numeric_id, op, val); + if(!check_result.ok() || !check_result.value_or_throw()) { return false; } } else { @@ -716,11 +728,19 @@ class VectorStorage { } } - // Optimized batch operation using pre-quantized QuantVectorObject - // This avoids double quantization by using already quantized data - void store_vectors_batch(const std::vector>& vectors) { + /* + * Stores vectors, metadata, and associated filter documents for one pre-quantized batch. 
+ * + * Return codes: + * 0 = success + * 1-99 = propagated filter validation failure from filter store + * 100-199 = propagated MDBX/storage failure from filter store + * 200-299 = propagated corruption/invariant failure from filter store + */ + ndd::OperationResult<> + store_vectors_batch(const std::vector>& vectors) { if(vectors.empty()) { - return; + return {SUCCESS, ""}; } // Prepare vector and meta batches @@ -758,8 +778,12 @@ class VectorStorage { // Process filter data in batch if any if(!filter_batch.empty()) { - filter_store_->add_filters_from_json_batch(filter_batch); + auto filter_result = filter_store_->add_filters_from_json_batch(filter_batch); + if(!filter_result.ok()) { + return filter_result; + } } + return {SUCCESS, ""}; } std::vector get_vector(ndd::idInt numeric_id) const { @@ -793,37 +817,69 @@ class VectorStorage { return meta_store_->get_meta(numeric_id); } - // NOT used anymore. Deletes filter, meta and vector data. - void deletePoint(ndd::idInt numeric_id) { + /* + * Deletes filter, metadata, and vector data for one numeric id. 
+ * + * Return codes: + * 0 = success + * 1-99 = propagated filter validation failure from filter store + * 100-199 = propagated MDBX/storage failure from filter store + * 200-299 = propagated corruption/invariant failure from filter store + */ + ndd::OperationResult<> deletePoint(ndd::idInt numeric_id) { try { // Get metadata first to get filter info auto meta = meta_store_->get_meta(numeric_id); // Remove filter entries if they exist if(!meta.filter.empty()) { - filter_store_->remove_filters_from_json(numeric_id, meta.filter); + auto filter_result = filter_store_->remove_filters_from_json(numeric_id, meta.filter); + if(!filter_result.ok()) { + return filter_result; + } } // Try to remove both vector and meta data vector_store_->remove(numeric_id); meta_store_->remove(numeric_id); + return {SUCCESS, ""}; } catch(const std::exception& e) { - throw std::runtime_error(std::string("Failed to remove vector and metadata: ") - + e.what()); + return {100, std::string("Failed to remove vector and metadata: ") + e.what()}; } } - // Deletes filter only. - void deleteFilter(ndd::idInt numeric_id, std::string filter) { - filter_store_->remove_filters_from_json(numeric_id, filter); + + /* + * Deletes only filter index entries for one numeric id. + * + * Return codes: + * 0 = success + * 1-99 = propagated filter validation failure from filter store + * 100-199 = propagated MDBX/storage failure from filter store + * 200-299 = propagated corruption/invariant failure from filter store + */ + ndd::OperationResult<> deleteFilter(ndd::idInt numeric_id, std::string filter) { + return filter_store_->remove_filters_from_json(numeric_id, filter); } - // Update filter for a vector - void updateFilter(ndd::idInt numeric_id, const std::string& new_filter_json) { + /* + * Replaces the filter document for one vector. 
+ * + * Return codes: + * 0 = success + * 1-99 = propagated filter validation failure from filter store + * 100-199 = propagated MDBX/storage failure from filter store + * 200-299 = propagated corruption/invariant failure from filter store + */ + ndd::OperationResult<> updateFilter(ndd::idInt numeric_id, + const std::string& new_filter_json) { // Get existing meta auto meta = meta_store_->get_meta(numeric_id); // Remove old filters if(!meta.filter.empty()) { - filter_store_->remove_filters_from_json(numeric_id, meta.filter); + auto remove_result = filter_store_->remove_filters_from_json(numeric_id, meta.filter); + if(!remove_result.ok()) { + return remove_result; + } } // Update meta @@ -832,8 +888,12 @@ class VectorStorage { // Add new filters if(!new_filter_json.empty()) { - filter_store_->add_filters_from_json(numeric_id, new_filter_json); + auto add_result = filter_store_->add_filters_from_json(numeric_id, new_filter_json); + if(!add_result.ok()) { + return add_result; + } } + return {SUCCESS, ""}; } ndd::quant::QuantizationLevel getQuantLevel() const { return vector_store_->getQuantLevel(); } diff --git a/src/utils/types.hpp b/src/utils/types.hpp index 431407b7e9..5e7cf2d88c 100644 --- a/src/utils/types.hpp +++ b/src/utils/types.hpp @@ -4,15 +4,19 @@ #include #include +#ifndef SUCCESS +# define SUCCESS 0 +#endif + namespace ndd { template struct OperationResult { - unsigned int code = 0; + unsigned int code = SUCCESS; std::string message; std::optional value; - bool ok() const { return code == 0; } + bool ok() const { return code == SUCCESS; } }; } // namespace ndd diff --git a/tests/filter_test.cpp b/tests/filter_test.cpp index 101be3403e..9568fc4d81 100644 --- a/tests/filter_test.cpp +++ b/tests/filter_test.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include "filter/filter.hpp" #include "json/nlohmann_json.hpp" @@ -9,6 +10,17 @@ namespace fs = std::filesystem; using json = nlohmann::json; +static void expect_ok(const 
ndd::OperationResult<>& result) { + EXPECT_TRUE(result.ok()) << result.message; +} + +template +static T unwrap_ok(ndd::OperationResult result) { + EXPECT_TRUE(result.ok()) << result.message; + EXPECT_TRUE(result.value.has_value()); + return std::move(*result.value); +} + TEST(BucketTest, Serialization) { ndd::filter::Bucket b; b.base_value = 100; @@ -55,16 +67,16 @@ TEST_F(FilterTest, CategoryFilterBasics) { // ID 2: City=London // ID 3: City=Paris - filter->add_to_filter("city", "Paris", 1); - filter->add_to_filter("city", "London", 2); - filter->add_to_filter("city", "Paris", 3); + expect_ok(filter->add_to_filter("city", "Paris", 1)); + expect_ok(filter->add_to_filter("city", "London", 2)); + expect_ok(filter->add_to_filter("city", "Paris", 3)); // Query for City=Paris json query = json::array({ {{"city", {{"$eq", "Paris"}}}} }); - std::vector ids = filter->getIdsMatchingFilter(query); + std::vector ids = unwrap_ok(filter->getIdsMatchingFilter(query)); // Should find 1 and 3 EXPECT_EQ(ids.size(), 2); @@ -79,15 +91,15 @@ TEST_F(FilterTest, BooleanFilterBasics) { // ID 11: Active=false // Using JSON add interface for variety - filter->add_filters_from_json(10, R"({"is_active": true})"); - filter->add_filters_from_json(11, R"({"is_active": false})"); + expect_ok(filter->add_filters_from_json(10, R"({"is_active": true})")); + expect_ok(filter->add_filters_from_json(11, R"({"is_active": false})")); // Query Active=true json query_true = json::array({ {{"is_active", {{"$eq", true}}}} }); - auto ids_true = filter->getIdsMatchingFilter(query_true); + auto ids_true = unwrap_ok(filter->getIdsMatchingFilter(query_true)); EXPECT_EQ(ids_true.size(), 1); EXPECT_EQ(ids_true[0], 10); @@ -96,7 +108,7 @@ TEST_F(FilterTest, BooleanFilterBasics) { {{"is_active", {{"$eq", false}}}} }); - auto ids_false = filter->getIdsMatchingFilter(query_false); + auto ids_false = unwrap_ok(filter->getIdsMatchingFilter(query_false)); EXPECT_EQ(ids_false.size(), 1); EXPECT_EQ(ids_false[0], 11); } @@ 
-106,16 +118,16 @@ TEST_F(FilterTest, NumericFilterBasics) { // ID 101: Age=30 // ID 102: Age=35 - filter->add_filters_from_json(100, R"({"age": 25})"); - filter->add_filters_from_json(101, R"({"age": 30})"); - filter->add_filters_from_json(102, R"({"age": 35})"); + expect_ok(filter->add_filters_from_json(100, R"({"age": 25})")); + expect_ok(filter->add_filters_from_json(101, R"({"age": 30})")); + expect_ok(filter->add_filters_from_json(102, R"({"age": 35})")); // Range Query: 20 <= Age <= 32 json query_range = json::array({ {{"age", {{"$range", {20, 32}}}}} }); - auto ids = filter->getIdsMatchingFilter(query_range); + auto ids = unwrap_ok(filter->getIdsMatchingFilter(query_range)); // Should match 100 (25) and 101 (30) EXPECT_EQ(ids.size(), 2); @@ -132,14 +144,14 @@ TEST_F(FilterTest, FloatNumericFilter) { // ID 1: Price=10.5 // ID 2: Price=20.0 - filter->add_filters_from_json(1, R"({"price": 10.5})"); - filter->add_filters_from_json(2, R"({"price": 20.0})"); + expect_ok(filter->add_filters_from_json(1, R"({"price": 10.5})")); + expect_ok(filter->add_filters_from_json(2, R"({"price": 20.0})")); json query = json::array({ {{"price", {{"$range", {10.0, 15.0}}}}} }); - auto ids = filter->getIdsMatchingFilter(query); + auto ids = unwrap_ok(filter->getIdsMatchingFilter(query)); EXPECT_EQ(ids.size(), 1); EXPECT_EQ(ids[0], 1); } @@ -149,9 +161,9 @@ TEST_F(FilterTest, MixedAndLogic) { // ID 2: City=NY, Age=40 (Age fail) // ID 3: City=LA, Age=30 (City fail) - filter->add_filters_from_json(1, R"({"city": "NY", "age": 30})"); - filter->add_filters_from_json(2, R"({"city": "NY", "age": 40})"); - filter->add_filters_from_json(3, R"({"city": "LA", "age": 30})"); + expect_ok(filter->add_filters_from_json(1, R"({"city": "NY", "age": 30})")); + expect_ok(filter->add_filters_from_json(2, R"({"city": "NY", "age": 40})")); + expect_ok(filter->add_filters_from_json(3, R"({"city": "LA", "age": 30})")); // Filter: City=NY AND Age < 35 json query = json::array({ @@ -159,7 +171,7 @@ 
TEST_F(FilterTest, MixedAndLogic) { {{"age", {{"$range", {0, 35}}}}} }); - auto ids = filter->getIdsMatchingFilter(query); + auto ids = unwrap_ok(filter->getIdsMatchingFilter(query)); EXPECT_EQ(ids.size(), 1); EXPECT_EQ(ids[0], 1); } @@ -169,51 +181,97 @@ TEST_F(FilterTest, InOperator) { // ID 2: Color=Blue // ID 3: Color=Green - filter->add_to_filter("color", "Red", 1); - filter->add_to_filter("color", "Blue", 2); - filter->add_to_filter("color", "Green", 3); + expect_ok(filter->add_to_filter("color", "Red", 1)); + expect_ok(filter->add_to_filter("color", "Blue", 2)); + expect_ok(filter->add_to_filter("color", "Green", 3)); // Query: Color IN [Red, Green] json query = json::array({ {{"color", {{"$in", {"Red", "Green"}}}}} }); - auto ids = filter->getIdsMatchingFilter(query); + auto ids = unwrap_ok(filter->getIdsMatchingFilter(query)); EXPECT_EQ(ids.size(), 2); // 1 and 3 } TEST_F(FilterTest, DeleteFilter) { // ID 1: Tag=A - filter->add_to_filter("tag", "A", 1); + expect_ok(filter->add_to_filter("tag", "A", 1)); json query = json::array({ {{"tag", {{"$eq", "A"}}}} }); - EXPECT_EQ(filter->countIdsMatchingFilter(query), 1); + EXPECT_EQ(unwrap_ok(filter->countIdsMatchingFilter(query)), 1); // Remove functionality test // Usually removal requires us to know what to remove or we remove entire ID? 
// The Filter class has: remove_from_filter(field, value, id) - filter->remove_from_filter("tag", "A", 1); + expect_ok(filter->remove_from_filter("tag", "A", 1)); - EXPECT_EQ(filter->countIdsMatchingFilter(query), 0); + EXPECT_EQ(unwrap_ok(filter->countIdsMatchingFilter(query)), 0); } TEST_F(FilterTest, NumericDelete) { // ID 1: Score=100 - filter->add_filters_from_json(1, R"({"score": 100})"); + expect_ok(filter->add_filters_from_json(1, R"({"score": 100})")); // Check it exists json query = json::array({ {{"score", {{"$eq", 100}}}} }); - EXPECT_EQ(filter->countIdsMatchingFilter(query), 1); + EXPECT_EQ(unwrap_ok(filter->countIdsMatchingFilter(query)), 1); // Remove // remove_filters_from_json uses the whole object - filter->remove_filters_from_json(1, R"({"score": 100})"); + expect_ok(filter->remove_filters_from_json(1, R"({"score": 100})")); - EXPECT_EQ(filter->countIdsMatchingFilter(query), 0); + EXPECT_EQ(unwrap_ok(filter->countIdsMatchingFilter(query)), 0); +} + +TEST_F(FilterTest, RejectsMalformedFilterJson) { + auto result = filter->add_filters_from_json(1, R"({"city": "Paris")"); + + EXPECT_FALSE(result.ok()); + EXPECT_EQ(result.code, 1); +} + +TEST_F(FilterTest, RejectsUnsupportedFilterType) { + auto result = filter->add_filters_from_json(1, R"({"tags": ["a", "b"]})"); + + EXPECT_FALSE(result.ok()); + EXPECT_EQ(result.code, 2); +} + +TEST_F(FilterTest, RejectsSchemaTypeMismatch) { + expect_ok(filter->add_filters_from_json(1, R"({"age": 30})")); + + auto result = filter->add_filters_from_json(2, R"({"age": "thirty"})"); + + EXPECT_FALSE(result.ok()); + EXPECT_EQ(result.code, 3); +} + +TEST_F(FilterTest, RejectsInvalidOperator) { + json query = json::array({ + {{"city", {{"$contains", "Paris"}}}} + }); + + auto result = filter->getIdsMatchingFilter(query); + + EXPECT_FALSE(result.ok()); + EXPECT_EQ(result.code, 2); +} + +TEST_F(FilterTest, RejectsInvalidRange) { + expect_ok(filter->add_filters_from_json(1, R"({"score": 100})")); + json query = json::array({ 
+ {{"score", {{"$range", {200, 100}}}}} + }); + + auto result = filter->getIdsMatchingFilter(query); + + EXPECT_FALSE(result.ok()); + EXPECT_EQ(result.code, 2); } From 5d6ae776f025371da42b073141450734ff769de2 Mon Sep 17 00:00:00 2001 From: Shaleen Garg Date: Mon, 4 May 2026 14:29:59 +0000 Subject: [PATCH 14/28] comments updated --- src/core/ndd.hpp | 16 ++++--- src/filter/category_index.hpp | 2 + src/filter/filter.hpp | 27 +++++++++-- src/filter/numeric_index.hpp | 88 +++++++++++++++++++++++++++++++++++ src/utils/types.hpp | 23 +++++++++ 5 files changed, 147 insertions(+), 9 deletions(-) diff --git a/src/core/ndd.hpp b/src/core/ndd.hpp index 20a1db2068..8a11747bb9 100644 --- a/src/core/ndd.hpp +++ b/src/core/ndd.hpp @@ -990,7 +990,7 @@ class IndexManager { entry->alg = std::move(new_alg); } - /* + /** * Adds or updates a batch of vectors and their associated filter documents. * * Return codes: @@ -1350,8 +1350,10 @@ class IndexManager { } } - /* + /** * Deletes vectors from id mapper, filter indexes, sparse storage, and HNSW live set. + * XXX: Does not delete meta, vector data Meta and vector data will be overwritten when + * the id is reused. * * Return codes: * 0 = success @@ -1409,7 +1411,7 @@ class IndexManager { } } - /* + /** * Deletes all vectors matching a filter query. * * Return codes: @@ -1471,7 +1473,7 @@ class IndexManager { } } - /* + /** * Replaces filter documents for a batch of vectors. * * Return codes: @@ -1527,8 +1529,10 @@ class IndexManager { } } - /* + /** * Deletes one vector by string id and removes its filter index entries. + * The meta and filter will be deleted and the vector will be marked as + * deleted in HNSW. The id will be put in the deleted_ids in id mapper and will be reused for new vectors. * * Return codes: * 0 = success; value is false when the vector id does not exist @@ -1571,7 +1575,7 @@ class IndexManager { } } - /* + /** * Searches an index with optional filter bitmap computation. 
* * Return codes: diff --git a/src/filter/category_index.hpp b/src/filter/category_index.hpp index 79d183882f..1d5c873758 100644 --- a/src/filter/category_index.hpp +++ b/src/filter/category_index.hpp @@ -124,6 +124,7 @@ namespace ndd { + mdbx_strerror(rc)); } + // Open named DB for category/boolean rc = mdbx_dbi_open(txn, "category_idx", MDBX_CREATE, &dbi_); if(rc != MDBX_SUCCESS) { mdbx_txn_abort(txn); @@ -295,6 +296,7 @@ namespace ndd { return store_bitmap_internal(key, bitmap_result.value_or_throw()); } + // Expose key formatting for external batching logic static std::string make_key(const std::string& field, const std::string& value) { return format_filter_key(field, value); } diff --git a/src/filter/filter.hpp b/src/filter/filter.hpp index 3870ff1dd5..aec64f3e3c 100644 --- a/src/filter/filter.hpp +++ b/src/filter/filter.hpp @@ -1,5 +1,6 @@ #pragma once +// System includes #include #include #include @@ -25,10 +26,11 @@ enum class FieldType : uint8_t { Unknown = 0, String = 1, - Number = 2, + Number = 2, // Unified Integer and Float Bool = 4 }; +// Filter Functor for HNSW class BitMapFilterFunctor : public hnswlib::BaseFilterFunctor { const ndd::RoaringBitmap& bitmap_; @@ -42,6 +44,7 @@ class BitMapFilterFunctor : public hnswlib::BaseFilterFunctor { class Filter { private: MDBX_env* env_; + // Used for schema storage MDBX_dbi dbi_; std::string index_id_; std::string path_; @@ -219,12 +222,14 @@ class Filter { + mdbx_strerror(rc)); } + // max DBs to allow multiple databases (main + schema + numeric_forward + numeric_inverted) rc = mdbx_env_set_maxdbs(env_, 10); if(rc != MDBX_SUCCESS) { throw std::runtime_error(std::string("Failed to configure max DBs for filters: ") + mdbx_strerror(rc)); } + // Set geometry for auto-grow using the filter map size settings rc = mdbx_env_set_geometry(env_, -1, 1ULL << settings::FILTER_MAP_SIZE_BITS, @@ -266,6 +271,7 @@ class Filter { + mdbx_strerror(rc)); } + // Initialize Indices numeric_index_ = std::make_unique(env_); 
category_index_ = std::make_unique(env_); @@ -329,6 +335,7 @@ class Filter { return {1, "Filter operator must be a single-field object"}; } + // Check schema for field type FieldType type = FieldType::Unknown; { std::lock_guard lock(schema_mutex_); @@ -433,6 +440,7 @@ class Filter { partial_results.push_back(std::move(or_result)); } + // Optimization: Sort by cardinality (smallest first) std::sort(partial_results.begin(), partial_results.end(), [](const ndd::RoaringBitmap& left, const ndd::RoaringBitmap& right) { @@ -446,6 +454,8 @@ class Filter { ndd::RoaringBitmap final_result = partial_results[0]; for(size_t i = 1; i < partial_results.size(); ++i) { final_result &= partial_results[i]; + + // If result becomes empty, stop early if(final_result.isEmpty()) { return {SUCCESS, "", std::move(final_result)}; } @@ -454,8 +464,8 @@ class Filter { return {SUCCESS, "", std::move(final_result)}; } - /* - * Returns numeric ids matching a filter query. + /** + * Returns numeric ids matching a filter query based on the provided JSON filter array * * Return codes: * 0 = success @@ -544,11 +554,13 @@ class Filter { return {SUCCESS, ""}; } + // Create a map to collect IDs for each label filter std::unordered_map> label_filter_to_ids; label_filter_to_ids.reserve(id_filter_pairs.size()); std::vector numeric_filter_entries; numeric_filter_entries.reserve(id_filter_pairs.size()); + // Group IDs by filter for(const auto& [numeric_id, filter_json] : id_filter_pairs) { nlohmann::json parsed; try { @@ -606,6 +618,13 @@ class Filter { } } + /** + * XXX: For transactional correctness of filter adds, all the filters + * should be added in a single transaction. + * For now, they are being added in two different transactions. + * one for numeric_index and other for labels. 
+ */ + if(!numeric_filter_entries.empty()) { auto numeric_result = numeric_index_->put_batch(numeric_filter_entries); if(!numeric_result.ok()) { @@ -613,6 +632,7 @@ class Filter { } } + // Process each filter with its batch of IDs for(const auto& [filter_key, ids] : label_filter_to_ids) { auto add_result = add_to_filter_batch(filter_key, ids); if(!add_result.ok()) { @@ -702,6 +722,7 @@ class Filter { } remove_result = remove_from_filter(field, category_result.value_or_throw(), numeric_id); } else if(value.is_number()) { + // Remove from Numeric Index remove_result = numeric_index_->remove(field, numeric_id); } else if(value.is_boolean()) { remove_result = remove_from_filter(field, diff --git a/src/filter/numeric_index.hpp b/src/filter/numeric_index.hpp index c2206722d0..2d12f36739 100644 --- a/src/filter/numeric_index.hpp +++ b/src/filter/numeric_index.hpp @@ -235,6 +235,7 @@ namespace ndd { return field + ":" + std::to_string(id); } + // Key Format: [Field]:[BigEndian_BaseValue] std::string make_bucket_key(const std::string& field, uint32_t start_val) { uint32_t be_val = 0; #if defined(__GNUC__) || defined(__clang__) @@ -274,6 +275,7 @@ namespace ndd { const std::string& field, uint32_t value, ndd::idInt id) { + // Find bucket std::string bkey_str = make_bucket_key(field, value); MDBX_val key{const_cast(bkey_str.data()), bkey_str.size()}; MDBX_val data; @@ -284,17 +286,28 @@ namespace ndd { + std::string(mdbx_strerror(rc))}; } + /** + * Scan backward to find bucket covering 'value'. 
+ * Logic to find correct bucket: + */ rc = mdbx_cursor_get(cursor, &key, &data, MDBX_SET_RANGE); if(rc == MDBX_SUCCESS) { + // Check if we are in right field & range std::string found_key(static_cast(key.iov_base), key.iov_len); if(found_key.rfind(field + ":", 0) != 0 || parse_bucket_key_val(found_key) > value) { rc = mdbx_cursor_get(cursor, &key, &data, MDBX_PREV); } } else if(rc == MDBX_NOTFOUND) { + /** + * The only possible bucket that could still contain + * value is the very last bucket in the database. + * Hence jumping there. + */ rc = mdbx_cursor_get(cursor, &key, &data, MDBX_LAST); } + // Should be at correct bucket now if(rc != MDBX_SUCCESS) { mdbx_cursor_close(cursor); if(rc == MDBX_NOTFOUND) { @@ -319,6 +332,7 @@ namespace ndd { try { Bucket bucket = Bucket::deserialize(data.iov_base, data.iov_len, bucket_base); if(bucket.remove(id)) { + // Save back or Delete if empty if(bucket.is_empty()) { rc = mdbx_cursor_del(cursor, static_cast(0)); if(rc != MDBX_SUCCESS) { @@ -366,10 +380,12 @@ namespace ndd { + std::string(mdbx_strerror(rc))}; } + // Find candidate bucket std::string search_key = make_bucket_key(field, value); MDBX_val key{const_cast(search_key.data()), search_key.size()}; MDBX_val data; + // Move logic to find predecessor rc = mdbx_cursor_get(cursor, &key, &data, MDBX_SET_RANGE); if(rc == MDBX_SUCCESS) { std::string found_key(static_cast(key.iov_base), key.iov_len); @@ -406,6 +422,8 @@ namespace ndd { std::string found_key(static_cast(key.iov_base), key.iov_len); if(found_key.rfind(field + ":", 0) == 0) { target_base = parse_bucket_key_val(found_key); + + // Check range condition if(value >= target_base && (static_cast(value) - target_base) <= Bucket::MAX_DELTA) { @@ -417,6 +435,7 @@ namespace ndd { try { if(create_new) { + // Create new bucket at exact value Bucket bucket; bucket.base_value = value; bucket.add(value, id); @@ -433,18 +452,30 @@ namespace ndd { + std::string(mdbx_strerror(rc))}; } } else { + /** + * Update existing. 
+ * We must re-fetch current key/data because cursor move might have updated key/data. + */ MDBX_val k{const_cast(target_key_str.data()), target_key_str.size()}; MDBX_val v; rc = mdbx_cursor_get(cursor, &k, &v, MDBX_SET); if(rc != MDBX_SUCCESS) { + // Should not happen if logic is correct mdbx_cursor_close(cursor); return {200, "Failed to resync numeric bucket cursor: " + std::string(mdbx_strerror(rc))}; } Bucket bucket = Bucket::deserialize(v.iov_base, v.iov_len, target_base); + // Capacity Check if(bucket.ids.size() >= Bucket::MAX_SIZE) { + /** + * SPLIT LOGIC + * Sort is maintained by arrays. + * "Slide Split": Scan right from median. + * Ensure we don't split a group of identical values. + */ size_t mid_idx = bucket.ids.size() / 2; size_t probe_right = mid_idx; while(probe_right < bucket.deltas.size() && probe_right > 0 @@ -456,16 +487,23 @@ namespace ndd { if(probe_right < bucket.deltas.size()) { mid_idx = probe_right; } else { + // Fallback: Try scanning left size_t probe_left = mid_idx; while(probe_left > 0 && bucket.deltas[probe_left] == bucket.deltas[probe_left - 1]) { probe_left--; } + // All identical mid_idx = probe_left > 0 ? probe_left : bucket.deltas.size(); } + // If we hit end, we can't split by value uniqueness if(mid_idx == bucket.deltas.size()) { + /** + * Fallback: Just append (overfill) or implement logic to handle identicals. + * For now: Append. 
+ */ bucket.add(value, id); auto bytes = bucket.serialize(); MDBX_val k2{const_cast(target_key_str.data()), @@ -480,26 +518,43 @@ namespace ndd { return {SUCCESS, ""}; } + // Standard Slide Split Bucket right_bucket; right_bucket.base_value = bucket.base_value + bucket.deltas[mid_idx]; + // Move entries for(size_t i = mid_idx; i < bucket.deltas.size(); ++i) { right_bucket.add(bucket.base_value + bucket.deltas[i], bucket.ids[i]); } + // Truncate left bucket.deltas.resize(mid_idx); bucket.ids.resize(mid_idx); + // Rebuild left bitmap bucket.summary_bitmap = ndd::RoaringBitmap(); for(auto bucket_id : bucket.ids) { bucket.summary_bitmap.add(bucket_id); } + // Now add new value to correct bucket if(value >= right_bucket.base_value) { right_bucket.add(value, id); } else { + /** + * If value < right, goes to left. + * But wait, split point was determined by existing items. + * If new value is >= base+split_delta, it goes right. + * BUT we just cleared right from b. + * Correct logic: + * Oh wait, if we added to left, we might overflow again or break order? + * Simply: Check which bucket covers it. + * Left covers [Base, RightBase-1]. + * Right covers [RightBase, ...]. 
+ */ bucket.add(value, id); } + // Save Left auto left_bytes = bucket.serialize(); MDBX_val left_v{left_bytes.data(), left_bytes.size()}; MDBX_val left_k{const_cast(target_key_str.data()), @@ -511,12 +566,14 @@ namespace ndd { + std::string(mdbx_strerror(rc))}; } + // Save Right auto right_bytes = right_bucket.serialize(); std::string right_k_str = make_bucket_key(field, right_bucket.base_value); MDBX_val right_k{const_cast(right_k_str.data()), right_k_str.size()}; MDBX_val right_v{right_bytes.data(), right_bytes.size()}; + // Use put for new key rc = mdbx_put(txn, inverted_dbi_, &right_k, &right_v, MDBX_UPSERT); if(rc != MDBX_SUCCESS) { mdbx_cursor_close(cursor); @@ -524,9 +581,11 @@ namespace ndd { + std::string(mdbx_strerror(rc))}; } } else { + // Normal Insert bucket.add(value, id); auto bytes = bucket.serialize(); MDBX_val new_data{bytes.data(), bytes.size()}; + // Use cursor put to update current rc = mdbx_cursor_put(cursor, &k, &new_data, MDBX_CURRENT); if(rc != MDBX_SUCCESS) { mdbx_cursor_close(cursor); @@ -559,6 +618,8 @@ namespace ndd { const std::string& field, ndd::idInt id, uint32_t value) { + + // 1. Check Forward Index std::string fwd_key_str = make_forward_key(field, id); MDBX_val fwd_key{const_cast(fwd_key_str.data()), fwd_key_str.size()}; MDBX_val fwd_val; @@ -582,6 +643,7 @@ namespace ndd { + std::string(mdbx_strerror(rc))}; } + // 2. Update Forward MDBX_val new_val_data{&value, sizeof(uint32_t)}; rc = mdbx_put(txn, forward_dbi_, &fwd_key, &new_val_data, MDBX_UPSERT); if(rc != MDBX_SUCCESS) { @@ -589,6 +651,7 @@ namespace ndd { + std::string(mdbx_strerror(rc))}; } + // 3. Add to Inverted Buckets return add_to_buckets(txn, field, value, id); } @@ -746,6 +809,7 @@ namespace ndd { + std::string(mdbx_strerror(rc))}; } + // 1. 
Find Start Bucket std::string start_k = make_bucket_key(field, min_val); MDBX_val key{const_cast(start_k.data()), start_k.size()}; MDBX_val data; @@ -754,13 +818,16 @@ namespace ndd { if(rc == MDBX_SUCCESS) { std::string fkey(static_cast(key.iov_base), key.iov_len); if(fkey.rfind(field + ":", 0) != 0 || parse_bucket_key_val(fkey) > min_val) { + // Check if we need to back up MDBX_val prev_key = key; MDBX_val prev_data; + // Check prev int prev_rc = mdbx_cursor_get(cursor, &prev_key, &prev_data, MDBX_PREV); if(prev_rc == MDBX_SUCCESS) { std::string prev_key_str(static_cast(prev_key.iov_base), prev_key.iov_len); if(prev_key_str.rfind(field + ":", 0) == 0) { + // Prev is valid start key = prev_key; data = prev_data; } @@ -792,6 +859,7 @@ namespace ndd { } try { + // Iterate forward while(rc == MDBX_SUCCESS) { std::string cur_key(static_cast(key.iov_base), key.iov_len); if(cur_key.rfind(field + ":", 0) != 0) { @@ -803,6 +871,24 @@ namespace ndd { break; } + /** + * Peek Strategy: + * If bucket_base >= min_val, we know the start is covered. + * If we could know NEXT bucket start, we'd know overlap. + * Since we iterate, we can be greedy on read. + * + * For now, always deserialize. + * Potential optimization: Read only bitmap if we are "deep" in the range. + * e.g. min_val=10, max_val=100. Bucket=20. + * If bucket=20. Next Bucket=30. + * Then Bucket 20 covers [20..30). + * Range [10..100] covers [20..30] fully. + * So we need lookahead. + * + * Simple logic without lookahead: + * Just read full bucket. It's 8KB max (2 pages). + * It's fast unless we have millions of buckets. 
+ */ Bucket bucket = Bucket::deserialize(data.iov_base, data.iov_len, bucket_base); @@ -811,8 +897,10 @@ namespace ndd { uint32_t bucket_max = bucket.get_value(bucket.ids.size() - 1); if(bucket_min >= min_val && bucket_max <= max_val) { + // Full overlap result |= bucket.summary_bitmap; } else { + // Partial overlap for(size_t i = 0; i < bucket.ids.size(); ++i) { uint32_t value = bucket.get_value(i); if(value >= min_val && value <= max_val) { diff --git a/src/utils/types.hpp b/src/utils/types.hpp index 5e7cf2d88c..45dc18f572 100644 --- a/src/utils/types.hpp +++ b/src/utils/types.hpp @@ -1,7 +1,9 @@ #pragma once #include +#include #include +#include #include #ifndef SUCCESS @@ -17,6 +19,27 @@ struct OperationResult { std::optional value; bool ok() const { return code == SUCCESS; } + + T& value_or_throw() & { + if(!ok() || !value.has_value()) { + throw std::logic_error("OperationResult success value is not available: " + message); + } + return *value; + } + + const T& value_or_throw() const& { + if(!ok() || !value.has_value()) { + throw std::logic_error("OperationResult success value is not available: " + message); + } + return *value; + } + + T&& value_or_throw() && { + if(!ok() || !value.has_value()) { + throw std::logic_error("OperationResult success value is not available: " + message); + } + return std::move(*value); + } }; } // namespace ndd From 9601ac58db83fcaf18dec4fb3fbe8867e61f2b4b Mon Sep 17 00:00:00 2001 From: Shaleen Garg Date: Wed, 6 May 2026 11:09:24 +0000 Subject: [PATCH 15/28] filter adding gt, gte, lt, lte --- src/filter/filter.hpp | 104 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/src/filter/filter.hpp b/src/filter/filter.hpp index aec64f3e3c..acf7f974b0 100644 --- a/src/filter/filter.hpp +++ b/src/filter/filter.hpp @@ -2,8 +2,11 @@ // System includes #include +#include +#include #include #include +#include #include #include #include @@ -215,6 +218,79 @@ class Filter { return field + ":" + value; } + /* 
+ * Resolves [$lt | $lte | $gt | $gte] on a JSON numeric value into a + * sortable [min, max] range usable against NumericIndex::range / check_range. + * A returned pair with min > max signals a provably-empty range + * (e.g. $gt INT32_MAX, $lt the smallest float); callers must skip the lookup. + * + * Return codes: + * 0 = success + * 2 = value is not a finite number, or operator is not a numeric comparison; + * caller should return HTTP 400 + */ + static ndd::OperationResult> + numeric_bound_from_comparison(const std::string& op, const nlohmann::json& val) { + using Bound = std::pair; + constexpr uint32_t SORTABLE_MIN = 0x00000000u; + constexpr uint32_t SORTABLE_MAX = 0xFFFFFFFFu; + const Bound EMPTY{SORTABLE_MAX, SORTABLE_MIN}; + + if(!val.is_number()) { + return {2, op + " value must be a finite number"}; + } + if(!val.is_number_integer() && !std::isfinite(val.get())) { + return {2, op + " value must be a finite number"}; + } + + if(op == "$gte") { + auto sortable_result = sortable_from_json(val, op + " value"); + if(!sortable_result.ok()) { + return {sortable_result.code, sortable_result.message}; + } + return {SUCCESS, "", Bound{sortable_result.value_or_throw(), SORTABLE_MAX}}; + } + if(op == "$lte") { + auto sortable_result = sortable_from_json(val, op + " value"); + if(!sortable_result.ok()) { + return {sortable_result.code, sortable_result.message}; + } + return {SUCCESS, "", Bound{SORTABLE_MIN, sortable_result.value_or_throw()}}; + } + if(op == "$gt") { + if(val.is_number_integer()) { + int32_t x = val.get(); + if(x == std::numeric_limits::max()) { + return {SUCCESS, "", EMPTY}; + } + return {SUCCESS, "", Bound{ndd::filter::int_to_sortable(x + 1), SORTABLE_MAX}}; + } + float x = val.get(); + float next = std::nextafterf(x, std::numeric_limits::infinity()); + if(!std::isfinite(next)) { + return {SUCCESS, "", EMPTY}; + } + return {SUCCESS, "", Bound{ndd::filter::float_to_sortable(next), SORTABLE_MAX}}; + } + if(op == "$lt") { + if(val.is_number_integer()) { 
+ int32_t x = val.get(); + if(x == std::numeric_limits::min()) { + return {SUCCESS, "", EMPTY}; + } + return {SUCCESS, "", Bound{SORTABLE_MIN, ndd::filter::int_to_sortable(x - 1)}}; + } + float x = val.get(); + float next = std::nextafterf(x, -std::numeric_limits::infinity()); + if(!std::isfinite(next)) { + return {SUCCESS, "", EMPTY}; + } + return {SUCCESS, "", Bound{SORTABLE_MIN, ndd::filter::float_to_sortable(next)}}; + } + + return {2, "Unsupported numeric comparison operator: " + op}; + } + void init_environment() { int rc = mdbx_env_create(&env_); if(rc != MDBX_SUCCESS) { @@ -433,6 +509,22 @@ class Filter { return {range_result.code, range_result.message}; } or_result = std::move(range_result.value_or_throw()); + } else if(op == "$lt" || op == "$lte" || op == "$gt" || op == "$gte") { + if(type != FieldType::Number) { + return {2, op + " operator is only supported for numeric fields"}; + } + auto bound_result = numeric_bound_from_comparison(op, val); + if(!bound_result.ok()) { + return {bound_result.code, bound_result.message}; + } + auto [min_val, max_val] = bound_result.value_or_throw(); + if(min_val <= max_val) { + auto range_result = numeric_index_->range(field, min_val, max_val); + if(!range_result.ok()) { + return {range_result.code, range_result.message}; + } + or_result = std::move(range_result.value_or_throw()); + } } else { return {2, "Unsupported filter operator: " + op}; } @@ -856,6 +948,18 @@ class Filter { return numeric_index_->check_range(field, id, start_result.value_or_throw(), end_result.value_or_throw()); } + if(op == "$lt" || op == "$lte" || op == "$gt" || op == "$gte") { + auto bound_result = numeric_bound_from_comparison(op, val); + if(!bound_result.ok()) { + return {bound_result.code, bound_result.message}; + } + auto [min_val, max_val] = bound_result.value_or_throw(); + if(min_val > max_val) { + return {SUCCESS, "", false}; + } + return numeric_index_->check_range(field, id, min_val, max_val); + } + return {2, "Unsupported numeric 
operator: " + op}; } }; From bba8e6a68975d6073031be9d8653814de6f47ba9 Mon Sep 17 00:00:00 2001 From: Shaleen Garg Date: Wed, 6 May 2026 11:37:43 +0000 Subject: [PATCH 16/28] reject filters with : in key or value --- src/filter/filter.hpp | 53 ++++++++++++ tests/filter_test.cpp | 192 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 245 insertions(+) diff --git a/src/filter/filter.hpp b/src/filter/filter.hpp index acf7f974b0..74a7d95761 100644 --- a/src/filter/filter.hpp +++ b/src/filter/filter.hpp @@ -211,9 +211,26 @@ class Filter { if(str_val.size() > 255) { return {2, context + " is too long"}; } + auto delim_check = validate_filter_key_component(str_val, context); + if(!delim_check.ok()) { + return {delim_check.code, delim_check.message}; + } return {SUCCESS, "", std::move(str_val)}; } + // Rejects ':' because it is the MDBX key delimiter for category and numeric + // indexes (see format_filter_key, NumericIndex::make_*_key). Allowing ':' in + // user input causes byte-level key collisions across distinct (field, value) + // pairs. 
+ static ndd::OperationResult<> + validate_filter_key_component(const std::string& component, + const std::string& context) { + if(component.find(':') != std::string::npos) { + return {1, context + " must not contain ':'"}; + } + return {SUCCESS, ""}; + } + static std::string format_filter_key(const std::string& field, const std::string& value) { return field + ":" + value; } @@ -407,6 +424,10 @@ class Filter { if(field.empty()) { return {1, "Filter field name cannot be empty"}; } + auto field_check = validate_filter_key_component(field, "Filter field name"); + if(!field_check.ok()) { + return {field_check.code, field_check.message}; + } if(!expr.is_object() || expr.size() != 1) { return {1, "Filter operator must be a single-field object"}; } @@ -610,6 +631,14 @@ class Filter { */ ndd::OperationResult<> add_to_filter(const std::string& field, const std::string& value, ndd::idInt numeric_id) { + auto field_check = validate_filter_key_component(field, "Filter field name"); + if(!field_check.ok()) { + return field_check; + } + auto value_check = validate_filter_key_component(value, "Filter value"); + if(!value_check.ok()) { + return value_check; + } return category_index_->add(field, value, numeric_id); } @@ -669,6 +698,10 @@ class Filter { if(field.empty()) { return {1, "Filter field name cannot be empty"}; } + auto field_check = validate_filter_key_component(field, "Filter field name"); + if(!field_check.ok()) { + return {field_check.code, field_check.message}; + } FieldType type = FieldType::Unknown; if(value.is_boolean()) { @@ -747,6 +780,14 @@ class Filter { remove_from_filter(const std::string& field, const std::string& value, ndd::idInt numeric_id) { + auto field_check = validate_filter_key_component(field, "Filter field name"); + if(!field_check.ok()) { + return field_check; + } + auto value_check = validate_filter_key_component(value, "Filter value"); + if(!value_check.ok()) { + return value_check; + } return category_index_->remove(field, value, numeric_id); 
} @@ -760,6 +801,14 @@ class Filter { */ ndd::OperationResult contains(const std::string& field, const std::string& value, ndd::idInt numeric_id) const { + auto field_check = validate_filter_key_component(field, "Filter field name"); + if(!field_check.ok()) { + return {field_check.code, field_check.message}; + } + auto value_check = validate_filter_key_component(value, "Filter value"); + if(!value_check.ok()) { + return {value_check.code, value_check.message}; + } return category_index_->contains(field, value, numeric_id); } @@ -804,6 +853,10 @@ class Filter { if(field.empty()) { return {1, "Filter field name cannot be empty"}; } + auto field_check = validate_filter_key_component(field, "Filter field name"); + if(!field_check.ok()) { + return {field_check.code, field_check.message}; + } ndd::OperationResult<> remove_result{SUCCESS, ""}; if(value.is_string()) { diff --git a/tests/filter_test.cpp b/tests/filter_test.cpp index 9568fc4d81..bb0d023d87 100644 --- a/tests/filter_test.cpp +++ b/tests/filter_test.cpp @@ -1,4 +1,6 @@ #include +#include +#include #include #include #include @@ -275,3 +277,193 @@ TEST_F(FilterTest, RejectsInvalidRange) { EXPECT_FALSE(result.ok()); EXPECT_EQ(result.code, 2); } + +TEST_F(FilterTest, RejectsColonInFieldNameOnInsert) { + auto result = filter->add_filters_from_json(1, R"({"user:id": "x"})"); + + EXPECT_FALSE(result.ok()); + EXPECT_EQ(result.code, 1); +} + +TEST_F(FilterTest, RejectsColonInValueOnInsert) { + auto result = filter->add_filters_from_json(1, R"({"city": "Paris:France"})"); + + EXPECT_FALSE(result.ok()); + EXPECT_EQ(result.code, 1); +} + +TEST_F(FilterTest, RejectsColonInFieldNameOnQuery) { + json query = json::array({ + {{"user:id", {{"$eq", "x"}}}} + }); + + auto result = filter->getIdsMatchingFilter(query); + + EXPECT_FALSE(result.ok()); + EXPECT_EQ(result.code, 1); +} + +TEST_F(FilterTest, RejectsColonInValueOnQuery) { + expect_ok(filter->add_filters_from_json(1, R"({"city": "Paris"})")); + json query = json::array({ 
+ {{"city", {{"$eq", "Paris:France"}}}} + }); + + auto result = filter->getIdsMatchingFilter(query); + + EXPECT_FALSE(result.ok()); + EXPECT_EQ(result.code, 1); +} + +TEST_F(FilterTest, RejectsColonInLowLevelAddToFilter) { + auto field_result = filter->add_to_filter("user:id", "x", 1); + EXPECT_FALSE(field_result.ok()); + EXPECT_EQ(field_result.code, 1); + + auto value_result = filter->add_to_filter("user", "x:y", 1); + EXPECT_FALSE(value_result.ok()); + EXPECT_EQ(value_result.code, 1); +} + +namespace { + +std::vector sorted_ids(std::vector ids) { + std::sort(ids.begin(), ids.end()); + return ids; +} + +} // namespace + +TEST_F(FilterTest, ComparisonOperatorsInteger) { + expect_ok(filter->add_filters_from_json(1, R"({"age": 25})")); + expect_ok(filter->add_filters_from_json(2, R"({"age": 30})")); + expect_ok(filter->add_filters_from_json(3, R"({"age": 35})")); + + auto run = [&](const json& expr) { + json query = json::array({{{"age", expr}}}); + return sorted_ids(unwrap_ok(filter->getIdsMatchingFilter(query))); + }; + + EXPECT_EQ(run({{"$lt", 30}}), (std::vector{1})); + EXPECT_EQ(run({{"$lte", 30}}), (std::vector{1, 2})); + EXPECT_EQ(run({{"$gt", 30}}), (std::vector{3})); + EXPECT_EQ(run({{"$gte", 30}}), (std::vector{2, 3})); +} + +TEST_F(FilterTest, ComparisonOperatorsFloat) { + expect_ok(filter->add_filters_from_json(1, R"({"price": 10.5})")); + expect_ok(filter->add_filters_from_json(2, R"({"price": 20.0})")); + expect_ok(filter->add_filters_from_json(3, R"({"price": 20.5})")); + + auto run = [&](const json& expr) { + json query = json::array({{{"price", expr}}}); + return sorted_ids(unwrap_ok(filter->getIdsMatchingFilter(query))); + }; + + EXPECT_EQ(run({{"$lt", 20.0}}), (std::vector{1})); + EXPECT_EQ(run({{"$lte", 20.0}}), (std::vector{1, 2})); + EXPECT_EQ(run({{"$gt", 20.0}}), (std::vector{3})); + EXPECT_EQ(run({{"$gte", 20.0}}), (std::vector{2, 3})); +} + +TEST_F(FilterTest, ComparisonOperatorsNegativeAndZero) { + expect_ok(filter->add_filters_from_json(1, 
R"({"temp": -5})")); + expect_ok(filter->add_filters_from_json(2, R"({"temp": 0})")); + expect_ok(filter->add_filters_from_json(3, R"({"temp": 5})")); + + auto run = [&](const json& expr) { + json query = json::array({{{"temp", expr}}}); + return sorted_ids(unwrap_ok(filter->getIdsMatchingFilter(query))); + }; + + EXPECT_EQ(run({{"$lt", 0}}), (std::vector{1})); + EXPECT_EQ(run({{"$gte", 0}}), (std::vector{2, 3})); + EXPECT_EQ(run({{"$lt", -5}}), (std::vector{})); + EXPECT_EQ(run({{"$lte", -5}}), (std::vector{1})); +} + +TEST_F(FilterTest, ComparisonAndCombination) { + expect_ok(filter->add_filters_from_json(1, R"({"city": "NY", "age": 25})")); + expect_ok(filter->add_filters_from_json(2, R"({"city": "NY", "age": 30})")); + expect_ok(filter->add_filters_from_json(3, R"({"city": "NY", "age": 35})")); + expect_ok(filter->add_filters_from_json(4, R"({"city": "LA", "age": 30})")); + + json query = json::array({ + {{"city", {{"$eq", "NY"}}}}, + {{"age", {{"$gte", 25}}}}, + {{"age", {{"$lt", 35}}}} + }); + + auto ids = sorted_ids(unwrap_ok(filter->getIdsMatchingFilter(query))); + EXPECT_EQ(ids, (std::vector{1, 2})); +} + +TEST_F(FilterTest, ComparisonInteractionWithIn) { + expect_ok(filter->add_filters_from_json(1, R"({"score": 1})")); + expect_ok(filter->add_filters_from_json(2, R"({"score": 5})")); + expect_ok(filter->add_filters_from_json(3, R"({"score": 10})")); + + json query = json::array({ + {{"score", {{"$in", {1, 5, 10}}}}}, + {{"score", {{"$gte", 5}}}} + }); + + auto ids = sorted_ids(unwrap_ok(filter->getIdsMatchingFilter(query))); + EXPECT_EQ(ids, (std::vector{2, 3})); +} + +TEST_F(FilterTest, ComparisonInteractionWithRange) { + for(int i = 10; i <= 50; i += 10) { + std::string body = R"({"v": )" + std::to_string(i) + "}"; + expect_ok(filter->add_filters_from_json(i, body)); + } + + json query = json::array({ + {{"v", {{"$range", {10, 50}}}}}, + {{"v", {{"$gt", 20}}}}, + {{"v", {{"$lte", 30}}}} + }); + + auto ids = 
sorted_ids(unwrap_ok(filter->getIdsMatchingFilter(query))); + EXPECT_EQ(ids, (std::vector{30})); +} + +TEST_F(FilterTest, ComparisonRejectsNonNumericValue) { + expect_ok(filter->add_filters_from_json(1, R"({"age": 25})")); + json query = json::array({ + {{"age", {{"$gt", "old"}}}} + }); + + auto result = filter->getIdsMatchingFilter(query); + EXPECT_FALSE(result.ok()); + EXPECT_EQ(result.code, 2); +} + +TEST_F(FilterTest, ComparisonRejectsOnNonNumericField) { + expect_ok(filter->add_filters_from_json(1, R"({"city": "NY"})")); + json query = json::array({ + {{"city", {{"$gt", 5}}}} + }); + + auto result = filter->getIdsMatchingFilter(query); + EXPECT_FALSE(result.ok()); + EXPECT_EQ(result.code, 2); +} + +TEST_F(FilterTest, ComparisonEmptyRangeAtIntegerBoundary) { + expect_ok(filter->add_filters_from_json(1, R"({"x": 0})")); + + json q_lt_min = json::array({ + {{"x", {{"$lt", INT_MIN}}}} + }); + auto r1 = filter->getIdsMatchingFilter(q_lt_min); + EXPECT_TRUE(r1.ok()) << r1.message; + EXPECT_EQ(r1.value_or_throw().size(), 0u); + + json q_gt_max = json::array({ + {{"x", {{"$gt", INT_MAX}}}} + }); + auto r2 = filter->getIdsMatchingFilter(q_gt_max); + EXPECT_TRUE(r2.ok()) << r2.message; + EXPECT_EQ(r2.value_or_throw().size(), 0u); +} From ecbee5df5d6e639a5bb61e30e1b4ddd6f2a723fb Mon Sep 17 00:00:00 2001 From: Shaleen Garg Date: Thu, 7 May 2026 04:02:03 +0000 Subject: [PATCH 17/28] do meta data fetch only for vectors that satisfy the filters --- src/core/ndd.hpp | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/src/core/ndd.hpp b/src/core/ndd.hpp index 8a11747bb9..f201e72cfd 100644 --- a/src/core/ndd.hpp +++ b/src/core/ndd.hpp @@ -1835,17 +1835,27 @@ class IndexManager { results.reserve(final_candidates.size()); LOG_DEBUG("Search results size: " << final_candidates.size()); - // Process and filter results + // Postfilter strategy: + // Every code path that feeds final_candidates already enforces filter_ptr: + // - Filtered 
HNSW search drops ids via BitMapFilterFunctor (filter.hpp). + // - Prefilter brute-force only iterates ids drawn from the bitmap. + // - Sparse search drops non-matching ids inside its scoring phase + // (inverted_index.cpp). + // So on the dense-only path the per-result contains() check is dead and + // we skip it. On the hybrid path we keep it as defense-in-depth in case + // sparse search ever stops honoring the filter; either way the check now + // runs before get_meta() so a (defensive) reject does not pay an MDBX read. + const bool postfilter_active = filter_ptr != nullptr && run_sparse_search; + size_t postfilter_drops = 0; size_t filtered_count = 0; for(const auto& p : final_candidates) { - // Get metadata - ndd::VectorMeta meta = entry.vector_storage->get_meta(p.second); - - // Apply filter - if(filter_ptr && !filter_ptr->contains(p.second)) { + if(postfilter_active && !filter_ptr->contains(p.second)) { + ++postfilter_drops; continue; } + ndd::VectorMeta meta = entry.vector_storage->get_meta(p.second); + ndd::VectorResult result; result.id = meta.id; result.filter = meta.filter; @@ -1878,6 +1888,15 @@ class IndexManager { if(results.size() > k) { results.resize(k); } + + // A drop here means an upstream filter step failed to honor filter_ptr. + // Log once per request rather than per-result to respect the hot-loop rule. 
+ if(postfilter_drops > 0) { + LOG_WARN(1222, + index_id, + "Postfilter dropped " << postfilter_drops + << " ids that bypassed upstream filter checks"); + } return {SUCCESS, "", std::move(results)}; } catch(const std::exception& e) { LOG_ERROR(2039, index_id, "Search failed: " << e.what()); From 4db88aac77d960ccda86d18d2c9437d43bca4e27 Mon Sep 17 00:00:00 2001 From: Shaleen Garg Date: Tue, 12 May 2026 08:44:50 +0000 Subject: [PATCH 18/28] safe filter bitmap deserialization --- src/filter/category_index.hpp | 54 ++- src/filter/numeric_index.hpp | 135 +++++- tests/filter_test.cpp | 787 ++++++++++++++++++++++++++++++++++ 3 files changed, 957 insertions(+), 19 deletions(-) diff --git a/src/filter/category_index.hpp b/src/filter/category_index.hpp index 1d5c873758..1fa8bca606 100644 --- a/src/filter/category_index.hpp +++ b/src/filter/category_index.hpp @@ -22,6 +22,42 @@ namespace ndd { return field + ":" + value; } + static ndd::OperationResult + read_bitmap_payload(const void* data, size_t len) { + if(data == nullptr || len == 0) { + return {200, "empty bitmap payload"}; + } + + const char* bytes = static_cast(data); + const size_t consumed = + roaring::api::roaring_bitmap_portable_deserialize_size(bytes, len); + if(consumed == 0) { + return {200, "invalid or truncated bitmap payload"}; + } + if(consumed != len) { + return {200, + "bitmap payload length mismatch: consumed " + + std::to_string(consumed) + " of " + + std::to_string(len) + " bytes"}; + } + + ndd::RoaringBitmap bitmap; + try { + bitmap = ndd::RoaringBitmap::readSafe(bytes, len); + } catch(const std::exception& e) { + return {200, + "failed to deserialize bitmap payload: " + std::string(e.what())}; + } + + const char* reason = nullptr; + if(!roaring::api::roaring_bitmap_internal_validate(&bitmap.roaring, &reason)) { + return {200, + std::string("invalid bitmap internals") + + (reason != nullptr ? 
": " + std::string(reason) : "")}; + } + return {SUCCESS, "", std::move(bitmap)}; + } + /* * Loads the bitmap stored for a formatted category filter key. * @@ -55,16 +91,20 @@ namespace ndd { + "': " + std::string(mdbx_strerror(rc))}; } - try { - ndd::RoaringBitmap bitmap = - ndd::RoaringBitmap::read(static_cast(data.iov_base)); + auto bitmap_result = read_bitmap_payload(data.iov_base, data.iov_len); + if(!bitmap_result.ok()) { mdbx_txn_abort(txn); - return {SUCCESS, "", std::move(bitmap)}; - } catch(const std::exception& e) { + return {bitmap_result.code, + "Corrupt category bitmap payload for key '" + filter_key + + "': " + bitmap_result.message}; + } + if(!bitmap_result.value.has_value()) { mdbx_txn_abort(txn); - return {200, "Corrupt category bitmap payload for key '" + filter_key - + "': " + e.what()}; + return {200, "Category bitmap reader succeeded without a bitmap for key '" + + filter_key + "'"}; } + mdbx_txn_abort(txn); + return {SUCCESS, "", std::move(*bitmap_result.value)}; } /* diff --git a/src/filter/numeric_index.hpp b/src/filter/numeric_index.hpp index 2d12f36739..a2c52a0011 100644 --- a/src/filter/numeric_index.hpp +++ b/src/filter/numeric_index.hpp @@ -76,6 +76,46 @@ namespace ndd { bool is_dirty = false; + static ndd::OperationResult + read_bitmap_payload(const uint8_t* data, size_t len) { + if(len == 0) { + return {SUCCESS, "", ndd::RoaringBitmap()}; + } + if(data == nullptr) { + return {200, "empty bitmap payload"}; + } + + const char* bytes = reinterpret_cast(data); + const size_t consumed = + roaring::api::roaring_bitmap_portable_deserialize_size(bytes, len); + if(consumed == 0) { + return {200, "invalid or truncated bitmap payload"}; + } + if(consumed != len) { + return {200, + "bitmap payload length mismatch: consumed " + + std::to_string(consumed) + " of " + + std::to_string(len) + " bytes"}; + } + + ndd::RoaringBitmap bitmap; + try { + bitmap = ndd::RoaringBitmap::readSafe(bytes, len); + } catch(const std::exception& e) { + return {200, + 
"failed to deserialize bitmap payload: " + std::string(e.what())}; + } + + const char* reason = nullptr; + if(!roaring::api::roaring_bitmap_internal_validate(&bitmap.roaring, + &reason)) { + return {200, + std::string("invalid bitmap internals") + + (reason != nullptr ? ": " + std::string(reason) : "")}; + } + return {SUCCESS, "", std::move(bitmap)}; + } + // Helper to get actual value uint32_t get_value(size_t index) const { return base_value + deltas[index]; @@ -175,16 +215,25 @@ namespace ndd { // 1. Bitmap Size uint32_t bm_size; - std::memcpy(&bm_size, ptr, 4); ptr += 4; - - if (ptr + bm_size > end) { + std::memcpy(&bm_size, ptr, sizeof(uint32_t)); + ptr += sizeof(uint32_t); + if (bm_size > static_cast(end - ptr)) { throw std::runtime_error("Bucket corrupt: invalid bitmap size"); } // 2. Bitmap if (bm_size > 0) { - b.summary_bitmap = ndd::RoaringBitmap::read(reinterpret_cast(ptr)); - ptr += bm_size; + auto bitmap_result = read_bitmap_payload(ptr, bm_size); + if(!bitmap_result.ok()) { + throw std::runtime_error("Bucket corrupt: " + + bitmap_result.message); + } + if(!bitmap_result.value.has_value()) { + throw std::runtime_error( + "Bucket corrupt: bitmap reader succeeded without a bitmap"); + } + b.summary_bitmap = std::move(*bitmap_result.value); + ptr += bm_size; } if (ptr + 2 > end) throw std::runtime_error("Bucket corrupt: truncated count"); @@ -212,13 +261,75 @@ namespace ndd { return b; } - // Fast access to just the bitmap (for middle buckets) - static ndd::RoaringBitmap read_summary_bitmap(const void* data, size_t len) { - const uint8_t* ptr = static_cast(data); - uint32_t bm_size; - std::memcpy(&bm_size, ptr, 4); ptr += 4; - if(bm_size == 0) return ndd::RoaringBitmap(); - return ndd::RoaringBitmap::read(reinterpret_cast(ptr)); + /** + * Fast access to just the bitmap. + * + * Used by range() when a bucket is fully covered by the query + * extent and we don't need the deltas/ids arrays. 
Skips the + * memcpy + vector allocations that full deserialize would do + * for those arrays. + * + * On the `count` field and why it is intentionally ignored here: + * + * The on-disk bucket layout this function reads is: + * + * [bm_size : uint32_t] + * [bitmap : bm_size bytes] + * [count : uint16_t] <-- not needed by us + * [deltas : count * sizeof(uint16_t)] <-- not read here + * [ids : count * sizeof(idInt)] <-- not read here + * + * `count` exists only so the older full-deserialize path knows how + * many delta/id entries follow the bitmap. That value can be + * recovered without an explicit field by computing + * (record_len - sizeof(uint32_t) - bm_size) + * / (sizeof(uint16_t) + sizeof(idInt)) + * which is exactly how the next major version of the bucket format + * will work -- the `count` field will be dropped to save 2 bytes + * per bucket on disk and to remove the redundancy between the + * stored count and the byte-length-derived count. + * + * For now `count` is preserved in the bucket layout to keep + * backward compatibility with existing on-disk indexes built by + * prior versions: those buckets carry the `count` field, and any + * code path that round-trips a bucket (read + modify + write) + * must continue to honor it. The full `Bucket::deserialize` / + * `Bucket::serialize` pair still reads and writes `count`. + * + * `read_summary_bitmap` only needs the bitmap, so it stops after + * the bitmap bytes and never touches `count` or anything after + * it. Skipping over `count` here is safe for both the current + * (count-bearing) layout and the future (count-less) layout, so + * this code path will continue to work unchanged when `count` is + * removed in a subsequent version. The corresponding alignment + * sanity check on the trailing bytes is intentionally omitted: + * any corruption in the trailing region is caught by the full + * `Bucket::deserialize` path that actually consumes those bytes. 
+ */ + static ndd::RoaringBitmap read_summary_bitmap(const void* data, + size_t len) { + if (len < sizeof(uint32_t)) { + throw std::runtime_error("Bucket corrupt: missing bitmap size"); + } + const uint8_t* ptr = static_cast(data); + const uint8_t* end = ptr + len; + uint32_t bm_size; + std::memcpy(&bm_size, ptr, sizeof(uint32_t)); + ptr += sizeof(uint32_t); + if (bm_size > static_cast(end - ptr)) { + throw std::runtime_error("Bucket corrupt: invalid bitmap size"); + } + if (bm_size == 0) return ndd::RoaringBitmap(); + auto bitmap_result = read_bitmap_payload(ptr, bm_size); + if(!bitmap_result.ok()) { + throw std::runtime_error("Bucket corrupt: " + + bitmap_result.message); + } + if(!bitmap_result.value.has_value()) { + throw std::runtime_error( + "Bucket corrupt: bitmap reader succeeded without a bitmap"); + } + return std::move(*bitmap_result.value); } bool is_full() const { return ids.size() >= MAX_SIZE; } diff --git a/tests/filter_test.cpp b/tests/filter_test.cpp index bb0d023d87..81cf0155ce 100644 --- a/tests/filter_test.cpp +++ b/tests/filter_test.cpp @@ -6,6 +6,7 @@ #include #include #include "filter/filter.hpp" +#include "filter/category_index.hpp" #include "json/nlohmann_json.hpp" #include "filter/numeric_index.hpp" // For Bucket test @@ -87,6 +88,135 @@ TEST_F(FilterTest, CategoryFilterBasics) { EXPECT_EQ(std::find(ids.begin(), ids.end(), 2), ids.end()); } +class CategoryIndexCorruptionTest : public ::testing::Test { +protected: + std::string db_path; + MDBX_env* env = nullptr; + std::unique_ptr category_index; + + void SetUp() override { + db_path = "./category_corrupt_db_" + std::to_string(rand()); + if(fs::exists(db_path)) { + fs::remove_all(db_path); + } + fs::create_directories(db_path); + + int rc = mdbx_env_create(&env); + ASSERT_EQ(rc, MDBX_SUCCESS) << mdbx_strerror(rc); + + rc = mdbx_env_set_maxdbs(env, 10); + ASSERT_EQ(rc, MDBX_SUCCESS) << mdbx_strerror(rc); + + rc = mdbx_env_set_geometry(env, + -1, + 1ULL << settings::FILTER_MAP_SIZE_BITS, + 
1ULL << settings::FILTER_MAP_SIZE_MAX_BITS, + 1ULL << settings::FILTER_MAP_SIZE_BITS, + -1, + -1); + ASSERT_EQ(rc, MDBX_SUCCESS) << mdbx_strerror(rc); + + rc = mdbx_env_open(env, + db_path.c_str(), + MDBX_WRITEMAP | MDBX_MAPASYNC | MDBX_NORDAHEAD, + 0664); + ASSERT_EQ(rc, MDBX_SUCCESS) << mdbx_strerror(rc); + + category_index = std::make_unique(env); + } + + void TearDown() override { + category_index.reset(); + if(env != nullptr) { + mdbx_env_close(env); + env = nullptr; + } + if(fs::exists(db_path)) { + fs::remove_all(db_path); + } + } + + void put_raw_payload(const std::string& key_string, std::vector& payload) { + MDBX_txn* txn = nullptr; + int rc = mdbx_txn_begin(env, nullptr, MDBX_TXN_READWRITE, &txn); + ASSERT_EQ(rc, MDBX_SUCCESS) << mdbx_strerror(rc); + + MDBX_val key{const_cast(key_string.data()), key_string.size()}; + MDBX_val data{payload.data(), payload.size()}; + rc = mdbx_put(txn, category_index->get_dbi(), &key, &data, MDBX_UPSERT); + if(rc != MDBX_SUCCESS) { + mdbx_txn_abort(txn); + ASSERT_EQ(rc, MDBX_SUCCESS) << mdbx_strerror(rc); + } + + rc = mdbx_txn_commit(txn); + ASSERT_EQ(rc, MDBX_SUCCESS) << mdbx_strerror(rc); + } +}; + +TEST_F(CategoryIndexCorruptionTest, RejectsTruncatedBitmapPayload) { + ndd::RoaringBitmap bitmap; + bitmap.add(1); + bitmap.add(3); + + std::vector payload(bitmap.getSizeInBytes()); + bitmap.write(payload.data(), true); + ASSERT_GT(payload.size(), 1u); + payload.pop_back(); + + put_raw_payload(ndd::filter::CategoryIndex::make_key("city", "Paris"), payload); + + auto result = category_index->get_bitmap("city", "Paris"); + EXPECT_FALSE(result.ok()); + EXPECT_EQ(result.code, 200u); +} + +TEST_F(CategoryIndexCorruptionTest, ReadsValidRawBitmapPayload) { + ndd::RoaringBitmap bitmap; + bitmap.add(11); + bitmap.add(29); + bitmap.runOptimize(); + + std::vector payload(bitmap.getSizeInBytes()); + bitmap.write(payload.data(), true); + + put_raw_payload(ndd::filter::CategoryIndex::make_key("city", "Berlin"), payload); + + auto result = 
category_index->get_bitmap("city", "Berlin"); + ASSERT_TRUE(result.ok()) << result.message; + ASSERT_TRUE(result.value.has_value()); + EXPECT_TRUE(result.value->contains(11)); + EXPECT_TRUE(result.value->contains(29)); + EXPECT_FALSE(result.value->contains(30)); +} + +TEST_F(CategoryIndexCorruptionTest, RejectsGarbageBitmapPayload) { + std::vector payload{0, 0, 0, 0, 1, 2, 3, 4}; + + put_raw_payload(ndd::filter::CategoryIndex::make_key("city", "Rome"), payload); + + auto result = category_index->get_bitmap("city", "Rome"); + EXPECT_FALSE(result.ok()); + EXPECT_EQ(result.code, 200u); + EXPECT_NE(result.message.find("invalid or truncated bitmap payload"), + std::string::npos); +} + +TEST_F(CategoryIndexCorruptionTest, RejectsTrailingBytesAfterBitmapPayload) { + ndd::RoaringBitmap bitmap; + bitmap.add(5); + + std::vector payload(bitmap.getSizeInBytes()); + bitmap.write(payload.data(), true); + payload.push_back('\0'); + + put_raw_payload(ndd::filter::CategoryIndex::make_key("city", "London"), payload); + + auto result = category_index->get_bitmap("city", "London"); + EXPECT_FALSE(result.ok()); + EXPECT_EQ(result.code, 200u); +} + TEST_F(FilterTest, BooleanFilterBasics) { // Boolean is just a special category "0" or "1" // ID 10: Active=true @@ -467,3 +597,660 @@ TEST_F(FilterTest, ComparisonEmptyRangeAtIntegerBoundary) { EXPECT_TRUE(r2.ok()) << r2.message; EXPECT_EQ(r2.value_or_throw().size(), 0u); } + +// ================================================================= +// Hypothesis tests for the dirty numeric_index.hpp range() perf work. +// +// These tests do NOT modify production code; they probe internal +// invariants of Bucket and end-to-end behavior of Filter to confirm +// or refute the claims made about the post_filter_new regression. +// +// Naming convention: HypothesisN_* where N matches the analysis. +// A failing assertion in these tests means the corresponding +// hypothesis is correct (the claimed unwanted behavior is observable). 
+// ================================================================= + +// --- Hypothesis 1 ---------------------------------------------------- +// Claim: For VectorDBBench-style packed int data, bucket values are +// densely packed in a much narrower extent than 65536, so the new +// "Coarse full-coverage fast path" predicate +// bucket_base >= min_val +// && bucket_base + Bucket::MAX_DELTA <= max_val +// is FALSE on the typical bucket -- even when the OLD post-deserialize +// predicate (bucket_min >= min_val && bucket_max <= max_val) is TRUE. +// Implication: the fast path does not actually fire on the workload +// it was meant to optimize, and we keep paying the deserialize cost. +// +// PASS = both predicates evaluated below match the hypothesis values. +// FAIL = the predicates disagree with the hypothesis (analysis is wrong). +TEST(Hypothesis1, FastPathPredicateMissesPackedBucket) { + constexpr uint32_t base = 0x80000000u; // sortable encoding of int 0 + constexpr size_t N = ndd::filter::Bucket::MAX_SIZE; + constexpr uint32_t spread = 1023; // values densely packed in [base, base+1023] + + ndd::filter::Bucket bucket; + bucket.base_value = base; + for (size_t i = 0; i < N; ++i) { + bucket.add(base + static_cast(i % (spread + 1)), + static_cast(i + 1)); + } + ASSERT_EQ(bucket.ids.size(), N); + + const uint32_t bucket_min = bucket.get_value(0); + const uint32_t bucket_max = bucket.get_value(bucket.ids.size() - 1); + EXPECT_EQ(bucket_min, base); + EXPECT_EQ(bucket_max, base + spread); + + // Query covers exactly the bucket's actual extent. 
+ const uint32_t min_val = bucket_min; + const uint32_t max_val = bucket_max; + + const bool old_full_overlap = bucket_min >= min_val && bucket_max <= max_val; + const bool new_fast_path = + bucket.base_value >= min_val + && static_cast(bucket.base_value) + + ndd::filter::Bucket::MAX_DELTA <= max_val; + + EXPECT_TRUE(old_full_overlap) + << "OLD code's full-overlap branch would fire on this bucket"; + EXPECT_FALSE(new_fast_path) + << "NEW fast path requires the entire 65536-wide extent to fit " + "inside [min,max], so it MISSES on packed buckets"; +} + +// Counter-test: when bucket values DO span the full delta range and +// the query is wide enough, the new fast path predicate is TRUE. +TEST(Hypothesis1, FastPathFiresOnWidelySpreadBucket) { + constexpr uint32_t base = 100'000; + ndd::filter::Bucket bucket; + bucket.base_value = base; + for (size_t i = 0; i < 1024; ++i) { + const uint32_t val = base + + static_cast( + (i * static_cast(ndd::filter::Bucket::MAX_DELTA)) + / 1023); + bucket.add(val, static_cast(i + 1)); + } + + const uint32_t min_val = base; + const uint32_t max_val = base + ndd::filter::Bucket::MAX_DELTA; + const bool new_fast_path = + bucket.base_value >= min_val + && static_cast(bucket.base_value) + + ndd::filter::Bucket::MAX_DELTA <= max_val; + EXPECT_TRUE(new_fast_path); +} + +// --- Hypothesis 2 ---------------------------------------------------- +// Claim: After bucket saturation with duplicates (the dirty Bucket::add +// caps deltas/ids at MAX_SIZE for delta_32 == 0 inserts but keeps +// adding to summary_bitmap), the bucket has cardinality > ids.size, +// and the new bitmap-only-inclusion branch in range() returns ids +// that the OLD code would never have surfaced. 
+TEST(Hypothesis2, SaturationCreatesBitmapOnlyEntries) { + constexpr uint32_t base = 0; + constexpr ndd::idInt N_TOTAL = ndd::filter::Bucket::MAX_SIZE + 500; + + ndd::filter::Bucket bucket; + bucket.base_value = base; + for (ndd::idInt i = 1; i <= N_TOTAL; ++i) { + bucket.add(base, i); // all duplicates of base_value + } + + EXPECT_EQ(bucket.ids.size(), ndd::filter::Bucket::MAX_SIZE); + EXPECT_EQ(bucket.summary_bitmap.cardinality(), N_TOTAL); + EXPECT_GT(bucket.summary_bitmap.cardinality(), bucket.ids.size()) + << "bitmap-only branch in range() will fire iff cardinality > ids.size"; +} + +// End-to-end check through the Filter API: when we insert MAX_SIZE+K +// rows that all share a numeric value, an $eq query should return all +// MAX_SIZE+K ids. If saturation drops K of them, this test fails -- but +// then the recall bump observed in the chart cannot be explained by +// this branch and we should look elsewhere. +TEST_F(FilterTest, Hypothesis2_RangeReturnsAllSaturatedDuplicates) { + constexpr int VALUE = 42; + constexpr ndd::idInt EXTRA = 500; + constexpr ndd::idInt N = ndd::filter::Bucket::MAX_SIZE + EXTRA; + + const std::string filter_payload = + std::string(R"({"score": )") + std::to_string(VALUE) + "}"; + for (ndd::idInt i = 1; i <= N; ++i) { + expect_ok(filter->add_filters_from_json(i, filter_payload)); + } + + json query = json::array({{ {"score", {{"$eq", VALUE}}} }}); + auto ids = unwrap_ok(filter->getIdsMatchingFilter(query)); + EXPECT_EQ(ids.size(), N) + << "If saturation logic is dropping ids, recall would actually go DOWN, " + "not up, contradicting the chart."; +} + +// --- Hypothesis 3 ---------------------------------------------------- +// Claim: When a slide-split fires on a saturated bucket, the LEFT +// bucket's summary_bitmap is rebuilt from `ids` only (see +// add_to_buckets at numeric_index.hpp:614-617): +// bucket.summary_bitmap = ndd::RoaringBitmap(); +// for (auto bucket_id : bucket.ids) bucket.summary_bitmap.add(bucket_id); +// Any 
bitmap-only entries (excess saturated duplicates) that lived on +// the LEFT side of the split are silently dropped. +// +// We reproduce the rebuild step inline because the slide-split lives +// inside NumericIndex::add_to_buckets (a private path with no test +// hook). If H3 holds, the data loss is observable on the local Bucket. +TEST(Hypothesis3, SlideSplitRebuildLosesBitmapOnlyEntries) { + constexpr uint32_t base = 0; + ndd::filter::Bucket bucket; + bucket.base_value = base; + + // Fill with MAX_SIZE unique-delta entries so a real split is possible. + for (uint32_t v = 0; v < ndd::filter::Bucket::MAX_SIZE; ++v) { + bucket.add(v, static_cast(v + 1)); + } + ASSERT_EQ(bucket.ids.size(), ndd::filter::Bucket::MAX_SIZE); + + // Simulate the saturated-duplicate path: bitmap gains an id but + // ids/deltas do not (because Bucket::add returns early for + // delta_32 == 0 once ids.size() >= MAX_SIZE). + constexpr ndd::idInt BITMAP_ONLY_ID_A = 100'000; + constexpr ndd::idInt BITMAP_ONLY_ID_B = 100'001; + bucket.summary_bitmap.add(BITMAP_ONLY_ID_A); + bucket.summary_bitmap.add(BITMAP_ONLY_ID_B); + ASSERT_EQ(bucket.summary_bitmap.cardinality(), bucket.ids.size() + 2); + + // Reproduce the slide-split LEFT-side rebuild. + const size_t mid_idx = bucket.ids.size() / 2; + bucket.deltas.resize(mid_idx); + bucket.ids.resize(mid_idx); + bucket.summary_bitmap = ndd::RoaringBitmap(); + for (auto id : bucket.ids) { + bucket.summary_bitmap.add(id); + } + + EXPECT_FALSE(bucket.summary_bitmap.contains(BITMAP_ONLY_ID_A)); + EXPECT_FALSE(bucket.summary_bitmap.contains(BITMAP_ONLY_ID_B)); + EXPECT_EQ(bucket.summary_bitmap.cardinality(), bucket.ids.size()); +} + +// --- Hypothesis 4 ---------------------------------------------------- +// Claim: accepting the OLD on-disk format (legacy uint16_t count +// between bitmap and arrays) recovers cliff-corrupted bitmap ids and +// can grow the range result candidate set. 
The production reader now +// rejects that payload shape instead of trying to salvage it. +TEST(Hypothesis4, DeserializeRejectsLegacyCountFormat) { + // Manually craft an OLD-format payload: + // [u32 bm_size] [bitmap bytes] [u16 count=0] + // i.e. cliff-truncated count, but bitmap retained the lost ids. + + constexpr ndd::idInt LOST_ID_A = 7; + constexpr ndd::idInt LOST_ID_B = 9; + ndd::RoaringBitmap original; + original.add(LOST_ID_A); + original.add(LOST_ID_B); + original.runOptimize(); + + const size_t bm_size = original.getSizeInBytes(); + std::vector buffer(sizeof(uint32_t) + bm_size + sizeof(uint16_t), 0); + uint8_t* ptr = buffer.data(); + + const uint32_t bm_size_32 = static_cast(bm_size); + std::memcpy(ptr, &bm_size_32, sizeof(uint32_t)); + ptr += sizeof(uint32_t); + original.write(reinterpret_cast(ptr)); + ptr += bm_size; + const uint16_t legacy_count = 0; + std::memcpy(ptr, &legacy_count, sizeof(uint16_t)); + + EXPECT_THROW( + (void)ndd::filter::Bucket::deserialize( + buffer.data(), buffer.size(), /*base_val=*/100), + std::runtime_error); +} + +// Companion check on the read_summary_bitmap fast-path helper: it must +// reject the same legacy-format payloads as the full deserializer, so +// the fast path cannot silently reintroduce compatibility. 
+TEST(Hypothesis4, ReadSummaryBitmapRejectsLegacyCountFormat) { + ndd::RoaringBitmap original; + for (ndd::idInt i = 0; i < 50; ++i) original.add(i * 3); + original.runOptimize(); + + const size_t bm_size = original.getSizeInBytes(); + std::vector buffer(sizeof(uint32_t) + bm_size + sizeof(uint16_t), 0); + uint8_t* ptr = buffer.data(); + const uint32_t bm_size_32 = static_cast(bm_size); + std::memcpy(ptr, &bm_size_32, sizeof(uint32_t)); + ptr += sizeof(uint32_t); + original.write(reinterpret_cast(ptr)); + ptr += bm_size; + const uint16_t legacy_count = 0; + std::memcpy(ptr, &legacy_count, sizeof(uint16_t)); + + EXPECT_THROW( + (void)ndd::filter::Bucket::deserialize(buffer.data(), buffer.size(), 0), + std::runtime_error); + EXPECT_THROW( + (void)ndd::filter::Bucket::read_summary_bitmap( + buffer.data(), buffer.size()), + std::runtime_error); +} + +TEST(NumericBucketCorruptionTest, RejectsExtraBytesInsideDeclaredBitmapPayload) { + ndd::RoaringBitmap original; + for(ndd::idInt i = 0; i < 50; ++i) { + original.add(i * 5); + } + original.runOptimize(); + + const size_t bm_size = original.getSizeInBytes(); + const uint32_t declared_bm_size = static_cast(bm_size + 1); + std::vector buffer(sizeof(uint32_t) + declared_bm_size, 0); + uint8_t* ptr = buffer.data(); + + std::memcpy(ptr, &declared_bm_size, sizeof(uint32_t)); + ptr += sizeof(uint32_t); + original.write(reinterpret_cast(ptr)); + + EXPECT_THROW( + (void)ndd::filter::Bucket::deserialize(buffer.data(), buffer.size(), 0), + std::runtime_error); + EXPECT_THROW( + (void)ndd::filter::Bucket::read_summary_bitmap( + buffer.data(), buffer.size()), + std::runtime_error); +} + +TEST(NumericBucketCorruptionTest, ReadBitmapPayloadReturnsOperationResultOnSuccess) { + ndd::RoaringBitmap original; + original.add(101); + original.add(202); + original.runOptimize(); + + std::vector payload(original.getSizeInBytes()); + original.write(reinterpret_cast(payload.data())); + + auto result = 
ndd::filter::Bucket::read_bitmap_payload(payload.data(), + payload.size()); + + ASSERT_TRUE(result.ok()) << result.message; + ASSERT_TRUE(result.value.has_value()); + EXPECT_TRUE(result.value->contains(101)); + EXPECT_TRUE(result.value->contains(202)); + EXPECT_FALSE(result.value->contains(303)); +} + +TEST(NumericBucketCorruptionTest, ReadBitmapPayloadRejectsGarbageWithoutThrowing) { + std::vector payload{0, 0, 0, 0, 7, 8, 9, 10}; + + auto result = ndd::filter::Bucket::read_bitmap_payload(payload.data(), + payload.size()); + + EXPECT_FALSE(result.ok()); + EXPECT_EQ(result.code, 200u); + EXPECT_FALSE(result.value.has_value()); + EXPECT_NE(result.message.find("invalid or truncated bitmap payload"), + std::string::npos); +} + +TEST(NumericBucketCorruptionTest, DeserializesValidBucketAfterPayloadValidation) { + ndd::filter::Bucket bucket; + bucket.base_value = 1000; + bucket.add(1000, 42); + bucket.add(1007, 43); + + auto bytes = bucket.serialize(); + auto decoded = ndd::filter::Bucket::deserialize(bytes.data(), + bytes.size(), + bucket.base_value); + auto bitmap_only = ndd::filter::Bucket::read_summary_bitmap(bytes.data(), + bytes.size()); + + EXPECT_EQ(decoded.base_value, bucket.base_value); + EXPECT_EQ(decoded.ids.size(), 2u); + EXPECT_TRUE(decoded.summary_bitmap.contains(42)); + EXPECT_TRUE(decoded.summary_bitmap.contains(43)); + EXPECT_TRUE(bitmap_only.contains(42)); + EXPECT_TRUE(bitmap_only.contains(43)); +} + +TEST(NumericBucketCorruptionTest, RejectsGarbageInsideDeclaredBitmapPayload) { + const uint32_t declared_bm_size = 8; + std::vector buffer(sizeof(uint32_t) + declared_bm_size, 0); + std::memcpy(buffer.data(), &declared_bm_size, sizeof(uint32_t)); + + EXPECT_THROW( + (void)ndd::filter::Bucket::deserialize(buffer.data(), buffer.size(), 0), + std::runtime_error); + EXPECT_THROW( + (void)ndd::filter::Bucket::read_summary_bitmap( + buffer.data(), buffer.size()), + std::runtime_error); +} + +// End-to-end recall check through the Filter API: insert N records 
+// with a wide spread of numeric values, run a wide range query, and +// compare the returned id set against a brute-force enumeration of +// the same JSON payload. If H4 is the regression cause, the chart's +// recall bump corresponds to results that match brute force more +// closely on the dirty branch -- but on a freshly built DB (no cliff +// state) this test must pass exactly. Mismatch here would mean the +// dirty range() over-includes even on clean data, which would shift +// the diagnosis. +TEST_F(FilterTest, Hypothesis4_RangeMatchesBruteForceOnCleanDb) { + constexpr ndd::idInt N = 5000; + // Spread values across more than one bucket extent (MAX_DELTA = 65535) + // so we exercise both the fast path and the per-bucket scan. + auto value_for = [](ndd::idInt i) -> int { + return static_cast((i * 37) % 200000); + }; + + for (ndd::idInt i = 1; i <= N; ++i) { + const std::string payload = + std::string(R"({"score": )") + std::to_string(value_for(i)) + "}"; + expect_ok(filter->add_filters_from_json(i, payload)); + } + + constexpr int LO = 50000; + constexpr int HI = 120000; + json query = json::array({ + {{"score", {{"$range", json::array({LO, HI})}}}} + }); + auto got = unwrap_ok(filter->getIdsMatchingFilter(query)); + std::sort(got.begin(), got.end()); + + std::vector expected; + for (ndd::idInt i = 1; i <= N; ++i) { + const int v = value_for(i); + if (v >= LO && v <= HI) expected.push_back(i); + } + std::sort(expected.begin(), expected.end()); + + EXPECT_EQ(got, expected); +} + +// ===================================================================== +// NumericRangeBench: targeted microbench against an EXISTING filter MDBX +// directory. Runs Filter::computeFilterBitmap (which calls +// NumericIndex::range) repeatedly for a few canned filter_rates and +// prints per-call wall time. Compare two builds (dirty vs stashed) +// against the SAME db path, with no concurrency, no HNSW, no HTTP. 
+// +// Activation: set ENDEE_BENCH_DB to a directory containing mdbx.dat. +// Optional: ENDEE_BENCH_FIELD (default "id"), ENDEE_BENCH_ITERS (default 200). +// +// Caveat: Filter::init_environment opens the env with MDBX_WRITEMAP, so +// no other process may hold the DB while the bench runs (stop the +// endee server first). The bench itself only issues read queries. +// ===================================================================== +namespace { +struct BenchPoint { + const char* label; + int lo; + int hi; +}; + +void run_bench_point(Filter& filter, + const std::string& field, + const BenchPoint& pt, + int iters) { + json query = json::array({ + {{field, {{"$range", json::array({pt.lo, pt.hi})}}}} + }); + + // Warmup -- prime page cache, schema cache, allocator state. + for (int i = 0; i < 3; ++i) { + auto r = filter.computeFilterBitmap(query); + ASSERT_TRUE(r.ok()) << r.message; + } + + size_t result_card = 0; + auto t0 = std::chrono::steady_clock::now(); + for (int i = 0; i < iters; ++i) { + auto r = filter.computeFilterBitmap(query); + ASSERT_TRUE(r.ok()) << r.message; + result_card = r.value_or_throw().cardinality(); + } + auto t1 = std::chrono::steady_clock::now(); + + const double total_ms = + std::chrono::duration(t1 - t0).count(); + const double per_call_ms = total_ms / iters; + + std::printf(" %-12s [% 8d,% 8d] iters=%d per_call=%.3f ms card=%zu\n", + pt.label, pt.lo, pt.hi, iters, per_call_ms, result_card); +} +} // namespace + +// Dumps internal structure of the bitmap that range() returns. Compare +// the output between clean and dirty builds to see whether the dirty +// path is producing a structurally different (and possibly slower to +// query) bitmap. Also bench-times a tight contains() loop on that +// bitmap to mirror what BitMapFilterFunctor does inside HNSW search. 
+TEST(NumericRangeBench, BitmapStructureAndContainsCost) { + const char* db_path = std::getenv("ENDEE_BENCH_DB"); + if (!db_path || !*db_path) GTEST_SKIP() << "Set ENDEE_BENCH_DB"; + const char* field_env = std::getenv("ENDEE_BENCH_FIELD"); + const std::string field = (field_env && *field_env) ? field_env : "id"; + + Filter filter(db_path); + + struct Point { const char* label; long long lo; long long hi; }; + const Point points[] = { + {"rate~0.99", 0, 9'900'000}, + {"rate~0.80", 0, 8'000'000}, + {"rate~0.50", 0, 5'000'000}, + {"rate~0.01", 0, 100'000}, + }; + + for (const auto& p : points) { + json q = json::array({{ {field, {{"$range", json::array({p.lo, p.hi})}}} }}); + auto r = filter.computeFilterBitmap(q); + ASSERT_TRUE(r.ok()) << r.message; + auto& bm = r.value_or_throw(); + const uint64_t card = bm.cardinality(); + + // Force-serialize to see the structural cost of the bitmap. + // The size after runOptimize is the most honest "structural cost" + // because OLD writes always runOptimize before persisting. + bm.runOptimize(); + const size_t opt_bytes = bm.getSizeInBytes(); + // Probe contains() cost on a fixed, stride-based set of ids inside + // the range. 1M lookups -- about the same order as HNSW filtered + // search visit count at moderate ef. + constexpr int N_PROBES = 1'000'000; + const long long stride = std::max(1, (p.hi - p.lo) / N_PROBES); + volatile uint64_t sink = 0; + auto t0 = std::chrono::steady_clock::now(); + for (long long v = p.lo; v < p.hi && v < p.lo + (long long)N_PROBES * stride; v += stride) { + sink += bm.contains(static_cast(v)) ? 
1 : 0; + } + auto t1 = std::chrono::steady_clock::now(); + const double total_us = + std::chrono::duration(t1 - t0).count(); + const long long probes_done = (p.hi - p.lo) / stride; + + std::printf(" %-10s card=%llu bytes_after_runOpt=%zu " + "contains(%lld probes)=%.1f us (%.1f ns/probe, hits=%llu)\n", + p.label, (unsigned long long)card, opt_bytes, + probes_done, total_us, + total_us * 1000.0 / std::max(1, probes_done), + (unsigned long long)sink); + } +} + +TEST(NumericRangeBench, ProbeValueDistribution) { + const char* db_path = std::getenv("ENDEE_BENCH_DB"); + if (!db_path || !*db_path) GTEST_SKIP() << "Set ENDEE_BENCH_DB"; + const char* field_env = std::getenv("ENDEE_BENCH_FIELD"); + const std::string field = (field_env && *field_env) ? field_env : "id"; + Filter f(db_path); + auto probe = [&](long long lo, long long hi) { + json q = json::array({{ {field, {{"$range", json::array({lo, hi})}}} }}); + auto r = f.computeFilterBitmap(q); + ASSERT_TRUE(r.ok()) << r.message; + std::printf(" range[% 12lld, % 12lld] card=%llu\n", + lo, hi, (unsigned long long)r.value_or_throw().cardinality()); + }; + probe(-2147483647LL, 2147483647LL); + probe(0, 10000000); + probe(0, 5000000); + probe(2500000, 7500000); + probe(-32768, 32767); + probe(0, 100000); +} + +TEST(NumericRangeBench, RangeQueryWallClock) { + const char* db_path = std::getenv("ENDEE_BENCH_DB"); + if (!db_path || !*db_path) { + GTEST_SKIP() << "Set ENDEE_BENCH_DB to a filter directory to run"; + } + const char* field_env = std::getenv("ENDEE_BENCH_FIELD"); + const std::string field = (field_env && *field_env) ? field_env : "id"; + const char* iters_env = std::getenv("ENDEE_BENCH_ITERS"); + const int iters = (iters_env && *iters_env) ? std::atoi(iters_env) : 200; + ASSERT_GT(iters, 0); + + std::printf("NumericRangeBench: db=%s field=%s iters=%d\n", + db_path, field.c_str(), iters); + + Filter filter(db_path); + + // Chart-aligned filter_rate buckets. 
The benchmark DB has uint32 + // values in [0, 10_000_000] with exactly one id per value (probed + // via ProbeValueDistribution), so filter_rate ~= (hi - lo) / 1e7. + const BenchPoint points[] = { + {"rate~0.99", 0, 9'900'000}, + {"rate~0.80", 0, 8'000'000}, + {"rate~0.50", 0, 5'000'000}, + {"rate~0.01", 0, 100'000}, + }; + + for (const auto& pt : points) { + run_bench_point(filter, field, pt, iters); + } +} + +// ===================================================================== +// NumericRangeBench_MT: same as above, but with N threads hammering +// range() concurrently against ONE shared Filter. Each thread issues +// computeFilterBitmap in a tight loop for a fixed wall-clock window. +// +// What this tells us: if dirty range() regresses here vs the clean +// build but the single-threaded NumericRangeBench above does not, +// the cost is concurrency-related (heap / allocator / cache-line +// contention triggered by the dirty in-memory layout), not per-call +// algorithmic. +// +// Activation: same env vars as the single-threaded bench, plus: +// ENDEE_BENCH_THREADS (default 16) +// ENDEE_BENCH_SECONDS (default 8) +// ===================================================================== +namespace { +struct MtResult { + uint64_t total_ops = 0; + uint64_t result_card_sample = 0; +}; + +void run_bench_point_mt(Filter& filter, + const std::string& field, + const BenchPoint& pt, + int threads, + double seconds) { + json query = json::array({ + {{field, {{"$range", json::array({pt.lo, pt.hi})}}}} + }); + + // Warmup serially -- prime page cache + schema cache. 
+ for (int i = 0; i < 3; ++i) { + auto r = filter.computeFilterBitmap(query); + ASSERT_TRUE(r.ok()) << r.message; + } + + std::atomic start{false}; + std::atomic stop{false}; + std::vector per_thread(threads); + + auto worker = [&](int tid) { + while (!start.load(std::memory_order_acquire)) { + std::this_thread::yield(); + } + uint64_t ops = 0; + uint64_t card_sample = 0; + while (!stop.load(std::memory_order_acquire)) { + auto r = filter.computeFilterBitmap(query); + if (!r.ok()) { + std::fprintf(stderr, "thread %d: %s\n", tid, r.message.c_str()); + return; + } + if ((ops & 0xFFF) == 0) { + card_sample = r.value_or_throw().cardinality(); + } + ++ops; + } + per_thread[tid].total_ops = ops; + per_thread[tid].result_card_sample = card_sample; + }; + + std::vector ts; + ts.reserve(threads); + for (int i = 0; i < threads; ++i) ts.emplace_back(worker, i); + + auto t0 = std::chrono::steady_clock::now(); + start.store(true, std::memory_order_release); + + std::this_thread::sleep_for(std::chrono::duration(seconds)); + stop.store(true, std::memory_order_release); + + for (auto& t : ts) t.join(); + auto t1 = std::chrono::steady_clock::now(); + + uint64_t total_ops = 0; + uint64_t card = 0; + for (const auto& r : per_thread) { + total_ops += r.total_ops; + if (r.result_card_sample) card = r.result_card_sample; + } + const double elapsed_s = + std::chrono::duration(t1 - t0).count(); + const double qps = total_ops / elapsed_s; + const double per_call_ms = (elapsed_s * 1000.0 * threads) / total_ops; + + std::printf(" %-12s [% 8d,% 8d] threads=%d ops=%llu qps=%.1f " + "per_call_avg=%.3f ms card=%llu\n", + pt.label, pt.lo, pt.hi, threads, + (unsigned long long)total_ops, qps, per_call_ms, + (unsigned long long)card); +} +} // namespace + +TEST(NumericRangeBench, RangeQueryMultiThreaded) { + const char* db_path = std::getenv("ENDEE_BENCH_DB"); + if (!db_path || !*db_path) { + GTEST_SKIP() << "Set ENDEE_BENCH_DB to a filter directory to run"; + } + const char* field_env = 
std::getenv("ENDEE_BENCH_FIELD"); + const std::string field = (field_env && *field_env) ? field_env : "id"; + const char* threads_env = std::getenv("ENDEE_BENCH_THREADS"); + const int threads = (threads_env && *threads_env) ? std::atoi(threads_env) : 16; + const char* seconds_env = std::getenv("ENDEE_BENCH_SECONDS"); + const double seconds = + (seconds_env && *seconds_env) ? std::atof(seconds_env) : 8.0; + ASSERT_GT(threads, 0); + ASSERT_GT(seconds, 0.0); + + std::printf("NumericRangeBench_MT: db=%s field=%s threads=%d seconds=%.1f\n", + db_path, field.c_str(), threads, seconds); + + Filter filter(db_path); + + const BenchPoint points[] = { + {"rate~0.99", 0, 9'900'000}, + {"rate~0.80", 0, 8'000'000}, + {"rate~0.50", 0, 5'000'000}, + {"rate~0.01", 0, 100'000}, + }; + + for (const auto& pt : points) { + run_bench_point_mt(filter, field, pt, threads, seconds); + } +} From 861a0c57f9e15ee20bc56f8489060689c3513ad7 Mon Sep 17 00:00:00 2001 From: Shaleen Garg Date: Tue, 12 May 2026 09:50:54 +0000 Subject: [PATCH 19/28] filter parameters validation --- src/main.cpp | 36 ++++++++++++++----- src/server/request_validation.hpp | 59 +++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 8 deletions(-) create mode 100644 src/server/request_validation.hpp diff --git a/src/main.cpp b/src/main.cpp index 1fe4bab119..415cefe6fe 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -37,6 +37,7 @@ #include "core/ndd.hpp" #include "auth.hpp" #include "quant/common.hpp" +#include "server/request_validation.hpp" #include "system_sanity/system_sanity.hpp" using ndd::quant::quantLevelToString; @@ -838,14 +839,33 @@ int main(int argc, char** argv) { // Extract filter parameters (Option B from chat plan) ndd::FilterParams filter_params; - if (body.has("filter_params")) { - auto fp = body["filter_params"]; - if (fp.has("prefilter_threshold")) { - filter_params.prefilter_threshold = static_cast(fp["prefilter_threshold"].i()); - } - if (fp.has("boost_percentage")) { - 
filter_params.boost_percentage = static_cast(fp["boost_percentage"].i()); - } + if(body.has("filter_params")) { + auto fp = body["filter_params"]; + if(fp.t() != crow::json::type::Object) { + return json_error(400, "filter_params must be an object"); + } + if(fp.has("prefilter_threshold")) { + auto prefilter_threshold = ndd::server::parse_bounded_size( + fp["prefilter_threshold"], + "filter_params.prefilter_threshold", + 0, + settings::MAX_VECTORS_ADMIN); + if(!prefilter_threshold.ok()) { + return json_error(400, prefilter_threshold.message); + } + filter_params.prefilter_threshold = prefilter_threshold.value_or_throw(); + } + if(fp.has("boost_percentage")) { + auto boost_percentage = + ndd::server::parse_bounded_size(fp["boost_percentage"], + "filter_params.boost_percentage", + 0, + 100); + if(!boost_percentage.ok()) { + return json_error(400, boost_percentage.message); + } + filter_params.boost_percentage = boost_percentage.value_or_throw(); + } } float dense_rrf_weight = body.has("dense_rrf_weight") ? 
(float)body["dense_rrf_weight"].d() : settings::DEFAULT_DENSE_RRF_WEIGHT; diff --git a/src/server/request_validation.hpp b/src/server/request_validation.hpp new file mode 100644 index 0000000000..c8838b31d6 --- /dev/null +++ b/src/server/request_validation.hpp @@ -0,0 +1,59 @@ +#pragma once + +#include +#include +#include +#include + +#include "crow/json.h" +#include "utils/types.hpp" + +namespace ndd::server { + +inline std::string bounded_size_error(const std::string& field_name, size_t min_value, size_t max_value) { + return field_name + " must be between " + std::to_string(min_value) + " and " + + std::to_string(max_value); +} + +inline ndd::OperationResult +parse_bounded_size(const crow::json::rvalue& value, + const std::string& field_name, + size_t min_value, + size_t max_value) { + try { + if(value.t() != crow::json::type::Number) { + return {1, field_name + " must be an integer"}; + } + + const auto number_type = value.nt(); + if(number_type == crow::json::num_type::Floating_point + || number_type == crow::json::num_type::Double_precision_floating_point) { + return {1, field_name + " must be an integer"}; + } + + size_t parsed_value = 0; + if(number_type == crow::json::num_type::Unsigned_integer) { + const uint64_t unsigned_value = value.u(); + if(unsigned_value > static_cast(max_value)) { + return {1, bounded_size_error(field_name, min_value, max_value)}; + } + parsed_value = static_cast(unsigned_value); + } else { + const int64_t signed_value = value.i(); + if(signed_value < 0) { + return {1, bounded_size_error(field_name, min_value, max_value)}; + } + parsed_value = static_cast(signed_value); + } + + if(parsed_value < min_value || parsed_value > max_value) { + return {1, bounded_size_error(field_name, min_value, max_value)}; + } + + return {SUCCESS, "", parsed_value}; + } catch(const std::exception&) { + return {1, field_name + " must be an integer"}; + } +} + +} // namespace ndd::server From f1cd4f5284767ea670682f2764e3bbb662b6fb19 Mon Sep 17 00:00:00 
2001 From: Shaleen Garg Date: Wed, 13 May 2026 01:55:00 +0000 Subject: [PATCH 20/28] bounding the filter mdbx size by reducing the number of updates within a txn --- src/filter/numeric_index.hpp | 40 +++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/src/filter/numeric_index.hpp b/src/filter/numeric_index.hpp index a2c52a0011..84a43287a6 100644 --- a/src/filter/numeric_index.hpp +++ b/src/filter/numeric_index.hpp @@ -341,6 +341,7 @@ namespace ndd { MDBX_env* env_; MDBX_dbi forward_dbi_; // ID -> Value (Field:ID -> Value) MDBX_dbi inverted_dbi_; // BucketKey -> BucketBlob + static constexpr size_t BATCH_TXN_CHUNK_SIZE = 256; std::string make_forward_key(const std::string& field, ndd::idInt id) { return field + ":" + std::to_string(id); @@ -798,7 +799,7 @@ namespace ndd { } /* - * Writes a batch of numeric filter entries in one MDBX write transaction. + * Writes a batch of numeric filter entries in bounded MDBX write transaction chunks. * * Return codes: * 0 = success @@ -811,25 +812,30 @@ namespace ndd { return {SUCCESS, ""}; } - MDBX_txn* txn = nullptr; - int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); - if(rc != MDBX_SUCCESS) { - return {100, "Failed to begin numeric batch write transaction: " - + std::string(mdbx_strerror(rc))}; - } + for(size_t start = 0; start < entries.size(); start += BATCH_TXN_CHUNK_SIZE) { + size_t end = std::min(start + BATCH_TXN_CHUNK_SIZE, entries.size()); - for(const auto& entry : entries) { - auto put_result = put_internal(txn, entry.field, entry.id, entry.value); - if(!put_result.ok()) { - mdbx_txn_abort(txn); - return put_result; + MDBX_txn* txn = nullptr; + int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); + if(rc != MDBX_SUCCESS) { + return {100, "Failed to begin numeric batch write transaction: " + + std::string(mdbx_strerror(rc))}; } - } - rc = mdbx_txn_commit(txn); - if(rc != MDBX_SUCCESS) { - return {100, "Failed to commit numeric batch write 
transaction: " - + std::string(mdbx_strerror(rc))}; + for(size_t i = start; i < end; ++i) { + const auto& entry = entries[i]; + auto put_result = put_internal(txn, entry.field, entry.id, entry.value); + if(!put_result.ok()) { + mdbx_txn_abort(txn); + return put_result; + } + } + + rc = mdbx_txn_commit(txn); + if(rc != MDBX_SUCCESS) { + return {100, "Failed to commit numeric batch write transaction: " + + std::string(mdbx_strerror(rc))}; + } } return {SUCCESS, ""}; } From 5edc082513443482ba37df5e697bf66f51fa0922 Mon Sep 17 00:00:00 2001 From: Shaleen Garg Date: Wed, 13 May 2026 03:23:39 +0000 Subject: [PATCH 21/28] removing search timing for testing --- src/core/ndd.hpp | 14 ---- src/main.cpp | 1 - src/utils/search_timing.hpp | 142 ------------------------------------ 3 files changed, 157 deletions(-) delete mode 100644 src/utils/search_timing.hpp diff --git a/src/core/ndd.hpp b/src/core/ndd.hpp index f201e72cfd..d6e52d42a8 100644 --- a/src/core/ndd.hpp +++ b/src/core/ndd.hpp @@ -13,7 +13,6 @@ #include "msgpack_ndd.hpp" #include "quant_vector.hpp" #include "wal.hpp" -#include "../utils/search_timing.hpp" #include "../quant/dispatch.hpp" #include #include @@ -1597,7 +1596,6 @@ class IndexManager { float kDenseRrfWeight = settings::DEFAULT_DENSE_RRF_WEIGHT, float kRrfRankConstant = settings::DEFAULT_RRF_RANK_CONSTANT) { - ndd::ScopedSearchTiming search_total_timer(ndd::searchTimingStats().search_total); const float kSparseRrfWeight = 1.0f - kDenseRrfWeight; try { auto entry_ptr = getIndexEntry(index_id); @@ -1625,8 +1623,6 @@ class IndexManager { // 0. 
Compute Filter Bitmap (Shared) std::optional active_filter_bitmap; if (!filter_array.empty()) { - ndd::ScopedSearchTiming filter_bitmap_timer( - ndd::searchTimingStats().filter_bitmap_compute); auto filter_result = entry.vector_storage->filter_store_->computeFilterBitmap(filter_array); if(!filter_result.ok()) { @@ -1696,15 +1692,9 @@ class IndexManager { if (card == 0) { // No results match filter } else if (card < params.prefilter_threshold) { - ndd::ScopedSearchTiming prefilter_total_timer( - ndd::searchTimingStats().prefilter_total); - ndd::recordPrefilterCardinality(card); - // Strategy A: Brute Force on Small Subset std::vector valid_ids; { - ndd::ScopedSearchTiming bitmap_to_ids_timer( - ndd::searchTimingStats().prefilter_bitmap_to_ids); valid_ids.reserve(card); bitmap.iterate( [](ndd::idInt id, void* ptr) { @@ -1715,8 +1705,6 @@ class IndexManager { } { - ndd::ScopedSearchTiming direct_score_timer( - ndd::searchTimingStats().prefilter_direct_mdbx_score); auto distance_func = space->get_dist_func(); void* dist_func_param = space->get_dist_func_param(); std::priority_queue> top_results; @@ -1748,8 +1736,6 @@ class IndexManager { } else { // Strategy B: Filtered HNSW Search - ndd::ScopedSearchTiming filtered_hnsw_timer( - ndd::searchTimingStats().filtered_hnsw_search); BitMapFilterFunctor functor(bitmap); size_t effective_ef = ef > 0 ? 
ef : settings::DEFAULT_EF_SEARCH; diff --git a/src/main.cpp b/src/main.cpp index 415cefe6fe..934ab1b5aa 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -277,7 +277,6 @@ int main(int argc, char** argv) { {{"status", "ok"}, {"timestamp", (std::int64_t)std::chrono::system_clock::now().time_since_epoch().count()}}); PRINT_LOG_TIME(); - ndd::printSearchTimingStats(); ndd::printSparseSearchDebugStats(); ndd::printSparseUpdateDebugStats(); print_mdbx_stats(); diff --git a/src/utils/search_timing.hpp b/src/utils/search_timing.hpp deleted file mode 100644 index cad40fe5f9..0000000000 --- a/src/utils/search_timing.hpp +++ /dev/null @@ -1,142 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace ndd { - inline constexpr bool SEARCH_TIMING_ENABLED = false; - - struct SearchTimingCounter { - std::atomic calls{0}; - std::atomic total_ns{0}; - }; - - struct SearchTimingStats { - SearchTimingCounter search_total; - SearchTimingCounter filter_bitmap_compute; - SearchTimingCounter filtered_hnsw_search; - SearchTimingCounter prefilter_total; - SearchTimingCounter prefilter_bitmap_to_ids; - SearchTimingCounter prefilter_direct_mdbx_score; - SearchTimingCounter prefilter_mdbx_get; - SearchTimingCounter prefilter_distance_compute; - std::atomic prefilter_cardinality_total{0}; - std::atomic prefilter_cardinality_max{0}; - }; - - inline SearchTimingStats& searchTimingStats() { - static SearchTimingStats stats; - return stats; - } - - inline timespec searchTimingNow() { - timespec ts{}; - clock_gettime(CLOCK_MONOTONIC, &ts); - return ts; - } - - inline uint64_t searchTimingElapsedNs(const timespec& start, const timespec& end) { - const uint64_t start_ns = - static_cast(start.tv_sec) * 1'000'000'000ULL - + static_cast(start.tv_nsec); - const uint64_t end_ns = - static_cast(end.tv_sec) * 1'000'000'000ULL - + static_cast(end.tv_nsec); - return end_ns >= start_ns ? 
end_ns - start_ns : 0; - } - - inline void addSearchTiming(SearchTimingCounter& counter, uint64_t elapsed_ns) { - if constexpr(SEARCH_TIMING_ENABLED) { - counter.calls.fetch_add(1, std::memory_order_relaxed); - counter.total_ns.fetch_add(elapsed_ns, std::memory_order_relaxed); - } - } - - class ScopedSearchTiming { - public: - explicit ScopedSearchTiming(SearchTimingCounter& counter) : - counter_(SEARCH_TIMING_ENABLED ? &counter : nullptr) { - if constexpr(SEARCH_TIMING_ENABLED) { - start_ = searchTimingNow(); - } - } - - ~ScopedSearchTiming() { - if constexpr(SEARCH_TIMING_ENABLED) { - addSearchTiming(*counter_, - searchTimingElapsedNs(start_, searchTimingNow())); - } - } - - private: - SearchTimingCounter* counter_{nullptr}; - timespec start_{}; - }; - - inline void recordPrefilterCardinality(size_t cardinality) { - if constexpr(!SEARCH_TIMING_ENABLED) { - return; - } - SearchTimingStats& stats = searchTimingStats(); - stats.prefilter_cardinality_total.fetch_add(static_cast(cardinality), - std::memory_order_relaxed); - - uint64_t current_max = - stats.prefilter_cardinality_max.load(std::memory_order_relaxed); - const uint64_t card = static_cast(cardinality); - while(card > current_max - && !stats.prefilter_cardinality_max.compare_exchange_weak( - current_max, card, std::memory_order_relaxed)) { - } - } - - inline void printSearchTimingStats() { - if constexpr(!SEARCH_TIMING_ENABLED) { - return; - } - SearchTimingStats& stats = searchTimingStats(); - - auto print_counter = [](const char* name, SearchTimingCounter& counter) -> uint64_t { - const uint64_t calls = counter.calls.exchange(0, std::memory_order_relaxed); - const uint64_t total_ns = counter.total_ns.exchange(0, std::memory_order_relaxed); - const double total_ms = static_cast(total_ns) / 1'000'000.0; - const double avg_ms = calls ? 
total_ms / static_cast(calls) : 0.0; - std::cerr << name << " count: " << calls << '\n'; - std::cerr << name << " total(ms): " - << std::fixed << std::setprecision(3) << total_ms << '\n'; - std::cerr << name << " avg(ms): " - << std::fixed << std::setprecision(3) << avg_ms << '\n'; - return calls; - }; - - std::cerr << "Search timing stats since last healthcheck\n"; - print_counter("search_total", stats.search_total); - print_counter("filter_bitmap_compute", stats.filter_bitmap_compute); - print_counter("filtered_hnsw_search", stats.filtered_hnsw_search); - const uint64_t prefilter_calls = print_counter("prefilter_total", stats.prefilter_total); - print_counter("prefilter_bitmap_to_ids", stats.prefilter_bitmap_to_ids); - print_counter("prefilter_direct_mdbx_score", stats.prefilter_direct_mdbx_score); - print_counter("prefilter_mdbx_get", stats.prefilter_mdbx_get); - print_counter("prefilter_distance_compute", stats.prefilter_distance_compute); - - const uint64_t cardinality_total = - stats.prefilter_cardinality_total.exchange(0, std::memory_order_relaxed); - const uint64_t cardinality_max = - stats.prefilter_cardinality_max.exchange(0, std::memory_order_relaxed); - std::cerr << "prefilter_cardinality total: " << cardinality_total << '\n'; - std::cerr << "prefilter_cardinality max: " << cardinality_max << '\n'; - std::cerr << "prefilter_cardinality avg: " - << std::fixed << std::setprecision(3) - << (prefilter_calls - ? static_cast(cardinality_total) - / static_cast(prefilter_calls) - : 0.0) - << '\n'; - std::cerr << "=================================\n"; - } - -} // namespace ndd From 4bab3b988f4edbdbae7bc1868bf235deffa18671 Mon Sep 17 00:00:00 2001 From: Shaleen Garg Date: Fri, 15 May 2026 07:43:32 +0000 Subject: [PATCH 22/28] testing (Part 1 subset) Cherry-pick of 02acc13 limited to Part-1-compatible tests. Adds request_validation_test.cpp covering filter parameter validation from 3e33557 and wires it into tests/CMakeLists.txt. 
The remaining contents of 02acc13 (vector_storage_test.cpp, numeric_index_stress_test.cpp, tests/repo_filter.py, and the new TEST_F additions in filter_test.cpp) exercise Part-2 behavior (bitmap-only bucket state, unified float numeric encoding, upsert cleanup, deleteFilter meta sync) and are deferred to Part 2. --- tests/CMakeLists.txt | 12 ++++ tests/request_validation_test.cpp | 102 ++++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 tests/request_validation_test.cpp diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 0793a2e2f3..2d32d9327c 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -36,5 +36,17 @@ target_include_directories(ndd_filter_test PRIVATE # Add other necessary definitions target_compile_definitions(ndd_filter_test PRIVATE MDB_MAXKEYSIZE=512) +add_executable(ndd_request_validation_test request_validation_test.cpp) + +target_link_libraries(ndd_request_validation_test GTest::gtest_main) + +target_include_directories(ndd_request_validation_test PRIVATE + ${CMAKE_SOURCE_DIR}/src + ${CMAKE_SOURCE_DIR}/src/server + ${CMAKE_SOURCE_DIR}/src/utils + ${CROW_INCLUDE_DIR} +) + include(GoogleTest) gtest_discover_tests(ndd_filter_test) +gtest_discover_tests(ndd_request_validation_test) diff --git a/tests/request_validation_test.cpp b/tests/request_validation_test.cpp new file mode 100644 index 0000000000..f080a8f508 --- /dev/null +++ b/tests/request_validation_test.cpp @@ -0,0 +1,102 @@ +#include +#include + +#include "core/types.hpp" +#include "crow/json.h" +#include "server/request_validation.hpp" +#include "settings.hpp" + +namespace { + +const crow::json::rvalue& field_from_json(const std::string& json_body, const char* field_name) { + static crow::json::rvalue body; + body = crow::json::load(json_body); + return body[field_name]; +} + +} // namespace + +TEST(RequestValidationTest, RejectsNegativePrefilterThreshold) { + auto result = ndd::server::parse_bounded_size(field_from_json(R"({"v": -1})", 
"v"), + "filter_params.prefilter_threshold", + 0, + settings::MAX_VECTORS_ADMIN); + + EXPECT_FALSE(result.ok()); + EXPECT_NE(result.message.find("must be between"), std::string::npos); +} + +TEST(RequestValidationTest, RejectsNegativeBoostPercentage) { + auto result = ndd::server::parse_bounded_size(field_from_json(R"({"v": -1})", "v"), + "filter_params.boost_percentage", + 0, + 100); + + EXPECT_FALSE(result.ok()); + EXPECT_NE(result.message.find("must be between"), std::string::npos); +} + +TEST(RequestValidationTest, AcceptsValidBounds) { + auto zero = ndd::server::parse_bounded_size(field_from_json(R"({"v": 0})", "v"), + "filter_params.prefilter_threshold", + 0, + settings::MAX_VECTORS_ADMIN); + ASSERT_TRUE(zero.ok()) << zero.message; + ASSERT_TRUE(zero.value.has_value()); + EXPECT_EQ(*zero.value, 0u); + + auto max_prefilter = ndd::server::parse_bounded_size( + field_from_json(R"({"v": 1000000000})", "v"), + "filter_params.prefilter_threshold", + 0, + settings::MAX_VECTORS_ADMIN); + ASSERT_TRUE(max_prefilter.ok()) << max_prefilter.message; + ASSERT_TRUE(max_prefilter.value.has_value()); + EXPECT_EQ(*max_prefilter.value, settings::MAX_VECTORS_ADMIN); + + auto max_boost = ndd::server::parse_bounded_size(field_from_json(R"({"v": 100})", "v"), + "filter_params.boost_percentage", + 0, + 100); + ASSERT_TRUE(max_boost.ok()) << max_boost.message; + ASSERT_TRUE(max_boost.value.has_value()); + EXPECT_EQ(*max_boost.value, 100u); +} + +TEST(RequestValidationTest, FilterParamsDefaultsRemainUnchangedWhenAbsent) { + ndd::FilterParams filter_params; + + EXPECT_EQ(filter_params.prefilter_threshold, settings::PREFILTER_CARDINALITY_THRESHOLD); + EXPECT_EQ(filter_params.boost_percentage, settings::FILTER_BOOST_PERCENTAGE); +} + +TEST(RequestValidationTest, RejectsOutOfRangeValues) { + auto prefilter = ndd::server::parse_bounded_size( + field_from_json(R"({"v": 1000000001})", "v"), + "filter_params.prefilter_threshold", + 0, + settings::MAX_VECTORS_ADMIN); + 
EXPECT_FALSE(prefilter.ok()); + + auto boost = ndd::server::parse_bounded_size(field_from_json(R"({"v": 101})", "v"), + "filter_params.boost_percentage", + 0, + 100); + EXPECT_FALSE(boost.ok()); +} + +TEST(RequestValidationTest, RejectsNonIntegerValues) { + auto floating = ndd::server::parse_bounded_size(field_from_json(R"({"v": 1.5})", "v"), + "filter_params.boost_percentage", + 0, + 100); + EXPECT_FALSE(floating.ok()); + EXPECT_NE(floating.message.find("must be an integer"), std::string::npos); + + auto string_value = ndd::server::parse_bounded_size(field_from_json(R"({"v": "5"})", "v"), + "filter_params.boost_percentage", + 0, + 100); + EXPECT_FALSE(string_value.ok()); + EXPECT_NE(string_value.message.find("must be an integer"), std::string::npos); +} From c2b177867fab6a6a15f8a4fa2903800c14a63474 Mon Sep 17 00:00:00 2001 From: Shaleen Garg Date: Wed, 13 May 2026 05:59:05 +0000 Subject: [PATCH 23/28] filter docs --- docs/filter.md | 634 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 562 insertions(+), 72 deletions(-) diff --git a/docs/filter.md b/docs/filter.md index fa1fa45bac..b6f401d01f 100644 --- a/docs/filter.md +++ b/docs/filter.md @@ -1,104 +1,594 @@ -# Filter Design & Strategy +# Filters -This document outlines the architectural design for Endee's filtering system, covering component designs for Numeric, Category, and Boolean types, and the overarching execution strategy. +Onboarding guide to the filter subsystem on the `filter_pass` branch. Read this +top-to-bottom. The "Caveats" sections call out behaviours that are +counter-intuitive or that the team has not yet fixed; treat them as load-bearing +context, not nitpicks. -## 1. Global Filtering Strategy +The source files this doc maps to: -The system prioritizes **Pre-Filtering** followed by an adaptive search execution path. 
+- [src/filter/filter.hpp](../src/filter/filter.hpp), [src/filter/filter.cpp](../src/filter/filter.cpp) — top-level `Filter` class, JSON parsing, schema, query dispatch. +- [src/filter/numeric_index.hpp](../src/filter/numeric_index.hpp), [src/filter/numeric_index.cpp](../src/filter/numeric_index.cpp) — `NumericIndex`, `Bucket`, sortable-key helpers. +- [src/filter/category_index.hpp](../src/filter/category_index.hpp), [src/filter/category_index.cpp](../src/filter/category_index.cpp) — `CategoryIndex`, the bitmap-per-key store used for strings and booleans. +- [src/storage/vector_storage.hpp](../src/storage/vector_storage.hpp) — wires the filter store to vectors and metadata (`store_vectors_batch`, `deleteFilter`, `updateFilter`). +- [src/core/ndd.hpp](../src/core/ndd.hpp) — `searchKNN`, `deleteVectorsByFilter`, `updateFilters`. The adaptive search path lives here. +- [src/core/types.hpp](../src/core/types.hpp) — `FilterParams`. +- [src/hnsw/hnswalg.h](../src/hnsw/hnswalg.h) — HNSW fatigue boost when a filter is active. +- [src/main.cpp](../src/main.cpp) — HTTP layer. -### 1.1. Execution Flow -1. **Filter Analysis:** - * Incoming queries (e.g., `Age: [18-25] AND City: "NY"`) are broken into atomic filter operations. - * **Cardinality Estimation:** Each filter estimates its result set size (e.g., "NY" has 500 users, "Age" has 10k). -2. **Optimization (Cheapest First):** - * Filters are executed in order of increasing cardinality (smallest first). - * Results are intersected (`AND`) incrementally. If the intermediate result becomes empty, execution stops early. -3. **Adaptive Search Path:** - * Final `RoaringBitmap` of valid IDs is passed to the Vector Search engine. - * **Small Result (< 1,000 IDs):** **Bypass HNSW.** Fetch vectors for valid IDs directly and perform Brute Force distance calculation. This avoids graph overhead for sparse results. - * **Large Result:** **Filtered HNSW.** Pass the Bitmap to HNSW's `searchKnn` via `BitMapFilterFunctor`. +--- + +## 1. 
Big picture + +``` +HTTP main.cpp parses request, builds filter_array JSON + | +IndexManager ndd.hpp searchKNN / insert / delete / update + | +VectorStorage vector_storage.hpp owns filter_store_ + meta_store_ + vector_store_ + | +Filter filter/filter.cpp schema + JSON-to-index dispatch + | | + | +-- NumericIndex numbers (Number = unified int/float) + | +-- CategoryIndex strings + booleans + | +MDBX one filter env per index, multiple named DBIs +``` + +There is **one MDBX environment per index**, opened from +`<index_dir>/filters`, with four sub-databases (the unnamed default dbi plus three named ones): + +| dbi | what it holds | +|------------------|------------------------------------------------------------| +| `(unnamed)` | filter schema JSON under key `__ndd_schema_v1__` | +| `numeric_forward`| `<field>:<id>` -> 4-byte sortable value (current value) | +| `numeric_inverted`| bucket-key -> serialized `Bucket` (the inverted index) | +| `category_idx` | `<field>:<value>` -> serialized RoaringBitmap of ids | + +Geometry is bounded by `settings::FILTER_MAP_SIZE_BITS` / `_MAX_BITS` +(env-overridable via `NDD_FILTER_MAP_SIZE_BITS` / `_MAX_BITS`). Default min is +16 MiB, default max is 64 GiB. --- -## 2. Numeric Filter Design +## 2. Error code contract -*Optimized for range queries, high compression, and sequential access.* +Every public filter call returns `ndd::OperationResult` (defined in +[src/utils/types.hpp](../src/utils/types.hpp)). The codes are a stable contract +between the filter layer and `main.cpp`: -### 2.1. Storage Architecture (Hybrid Bucket) -The database (LMDB) acts as a coarse-grained B+ Tree. NumericIndex opens two MDBX named databases: "numeric_forward" and "numeric_inverted". 
+| code | meaning | HTTP | +|-----------|------------------------------------------------------------------|------| +| `0` | success | 2xx | +| `1` | invalid JSON shape (not an array / not an object / bad keys) | 400 | +| `2` | unsupported operator or invalid value for the field type | 400 | +| `3` | field type conflict with the persisted schema | 400 | +| `100-199` | MDBX / storage failure | 500 | +| `200-299` | corruption / invariant violation | 500 | -In numeric_inverted -* **Key:** `[FieldID] + [Base_Value_32bit]`. - * Floats are mapped to lexicographically ordered integers to preserve sort order. - * Keys are stored in Big-Endian to support native cursor iteration. -* **Value (Bucket):** Fixed-size block (Max 1024 unique values). - * **Summary Bitmap (Roaring):** Pre-computed union of all IDs in the bucket. Used for $O(1)$ block retrieval during full overlaps. - * **Data Arrays (Structure of Arrays - SoA):** - * **Values:** Compressed as `uint16_t` deltas relative to the Key's `Base_Value`. - * **IDs:** Raw `idInt` array, index-aligned with values. +`main.cpp::operation_error_is_client_error` returns true for `code < 100`. +Doc-comments on every public method spell out the per-call code range. -In numeric_forward -* **Key:** `[field string]:[4-byte big-endian integer from values]` - * Floats are mapped to lexicographically ordered integers to preserve sort order. - * Keys are stored in Big-Endian to support native cursor iteration -* **Value +--- +## 3. Filter schema and field types -### 2.2. Query Execution -* **Buckets Fully Inside Selection (Middle):** Use **Summary Bitmap**. Zero array access. -* **Buckets Partially Overlapping (Edges):** Scan `Values` array (SIMD), use indices to fetch specific `IDs`. +Schema lives in the unnamed MDBX dbi under key `__ndd_schema_v1__` as a JSON +object `{ field_name -> FieldType }`. `FieldType` is: -### 2.3. Constraints & Splitting -* **Split Triggers:** Count > 1024 OR Delta > 65,535. 
-* **Sliding Split:** To ensure Key Uniqueness in LMDB, splits do not strictly occur at the median. The split point "slides" right to find the first value divergence, ensuring `Key(RightBucket) != Key(LeftBucket)`. +``` +Unknown = 0, String = 1, Number = 2, Bool = 4 +``` + +Two important rules: + +- **First-write-wins.** The first insert that mentions a field freezes its type. + Later inserts that use a different type return code `3`. See + `Filter::register_field_type` in [filter.cpp](../src/filter/filter.cpp). +- **JSON type drives `FieldType`.** `value.is_boolean()` → Bool, + `value.is_number()` → Number, `value.is_string()` → String. There is no way + to override. + +Schema is loaded once on `Filter` construction and cached in +`schema_cache_` under `schema_mutex_`. Every register touches MDBX (one +read-write txn per new field), so first inserts after restart pay a per-field +cost. + +### Caveats + +- The schema persistence is **not** atomic with numeric/category writes. The + schema commit happens inside `register_field_type` during validation + ([filter.cpp:683](../src/filter/filter.cpp#L683)), before any data is + written. A crash between schema commit and data write leaves a "registered + but empty" field. +- Low-level `add_to_filter()` / `add_to_filter_batch()` / + `remove_from_filter()` write directly to the `CategoryIndex` and bypass + schema registration entirely. They will happily create category entries for + a field that the schema (or a later JSON insert) thinks is `Number`. The + high-level `add_filters_from_json[_batch]` is the only schema-aware entry + point. Treat the low-level methods as legacy. --- -## 3. Category Filter Design +## 4. Numbers: one float32 sortable domain + +Every numeric value — both JSON integers and JSON floats — is funneled through +`Filter::sortable_from_json` ([filter.cpp:57](../src/filter/filter.cpp#L57)) +which: -*Optimized for exact match lookups and faceting.* +1. 
Rejects non-numeric or non-finite values (`code = 2`). +2. Calls `value.get<float>()` (float32, not double). +3. Normalizes signed zero. +4. Passes the float to `float_to_sortable` to get a `uint32_t` that sorts the + same way as the original float. -### 3.1. Interface (MongoDB-Style) -* **Single Value:** `{"City": "NY"}` -* **List Membership ($in):** `{"City": {"$in": ["NY", "London", "Tokyo"]}}` +`float_to_sortable` is the standard IEEE-754 trick: flip all bits if the sign +bit is set, otherwise flip just the sign bit +([numeric_index.cpp:21](../src/filter/numeric_index.cpp#L21)). It makes the +representation lexicographically ordered, which means we can scan inverted +buckets with a normal MDBX cursor and get range semantics. -### 3.2. Storage Architecture -Utilizes Inverted Indices with **Text-Based Keys** to enable prefix scanning and faceting. -* **Key:** `[FieldName] + ":" + [Value]`. - * **Parsing Logic:** The system strictly splits on the **first** occurrence of `:`. - * **Format:** `City:New:York` is parsed as Field=`City`, Value=`New:York`. - * **Constraints:** `FieldName` must **not** contain the `:` character (alphanumeric + underscore recommended). `Value` can contain any character including `:`. -* **Value:** `RoaringBitmap` (Serialized). Contains all IDs that have this attribute value. +There is also `int_to_sortable` in the same file. **It is no longer used by +inserts or queries.** All numeric paths go through `float_to_sortable`. The +function is left in the source for tests and for a potential future +"true integer" type. -### 3.3. Query Execution -* **Exact Match:** Direct Key lookup. -* **$in Query:** - 1. Parse the list `["NY", "London"]`. - 2. Perform multiple Key lookups. - 3. Compute the **Union** of the resulting Bitmaps efficiently. +### Caveats (read this before debugging an off-by-one) + +- **float32 precision.** Above `2^24 = 16,777,216`, not every integer is + representable in float32. 
`1 vs 1.0` compare equal (good) but + `16_777_217 vs 16_777_216` collapse to the same key (bad). The doc comment + above `sortable_from_json` spells this out. +- **Strict comparisons (`$gt`, `$lt`)** use `std::nextafterf` on the float32 + bound. The "next representable" gap grows with magnitude, so the bound for + `$gt 1e20` is very different from the bound for `$gt 1.0`. See + `Filter::numeric_bound_from_comparison` in + [filter.cpp:117](../src/filter/filter.cpp#L117). +- **Migration.** Older DBs that wrote integers through `int_to_sortable` will + not interoperate with the float32 sortable keys. The numeric index has no + version field; the only currently-supported migration is "rebuild the index." + Both `numeric_index.cpp` and the inline comment in + [filter.cpp:94](../src/filter/filter.cpp#L94) call this out. +- **Bucket density.** The float bit domain is less uniformly dense in the + integer range than `int_to_sortable` was. Integer-heavy fields will create + more buckets and walk more entries on wide range scans. +- **Large JSON integers.** `category_value_from_json` calls + `value.get<int>()` for integer category values + ([filter.cpp:91](../src/filter/filter.cpp#L91)). Values outside `int` are + unsafe (nlohmann throws on overflow; we do not catch with a code-2 message + yet). --- -## 4. Boolean Filter Design +## 5. Numeric inverted index + +Owned by `NumericIndex`. The data model is a B+-tree of fixed-width buckets +keyed by `<field>:<base_value>`. 
+ +### 5.1 Bucket layout + +```cpp +struct Bucket { + static constexpr size_t MAX_SIZE = 1024; // soft cap on ids.size() + static constexpr uint32_t MAX_DELTA = 65535; // u16 max + uint32_t base_value = 0; // runtime only + + std::vector<uint16_t> deltas; // sorted ascending + std::vector<ndd::idInt> ids; // index-aligned with deltas + ndd::RoaringBitmap summary_bitmap; // union of all ids +}; +``` + +Serialization (see `Bucket::serialize` / +`Bucket::deserialize` in [numeric_index.cpp:162](../src/filter/numeric_index.cpp#L162)): + +``` +[uint32_t bm_size][bitmap bytes][deltas (N * u16)][ids (N * u32)] +``` + +`N` is **derived** from the residual bytes after the bitmap: +`(iov_len - 4 - bm_size) / (sizeof(u16) + sizeof(idInt))`. The branch removed +the explicit count field — this is what the `e9cca02 numeric filters using +only floats` and the bitmap-only-bucket fix commits depend on, because it lets +`ids.size()` transiently exceed `MAX_SIZE` (slide-split fallthrough) without +overflowing a stored count. + +### 5.2 Inserts + +`NumericIndex::put_internal` ([numeric_index.cpp:720](../src/filter/numeric_index.cpp#L720)): + +1. Look up the forward entry `<field>:<id>`. If present with the same value, + no-op. If different, remove the id from its old bucket. +2. Upsert the forward entry to the new value. +3. Call `add_to_buckets` to add the id to the correct inverted bucket. + +`add_to_buckets` ([numeric_index.cpp:448](../src/filter/numeric_index.cpp#L448)) +walks back from `MDBX_SET_RANGE` to find the predecessor bucket whose +`[base, base+MAX_DELTA]` covers the value. If no such bucket exists, it +creates one keyed at the exact value. If the matching bucket is at +`MAX_SIZE`, it runs the **slide split**. + +### 5.3 Slide split -*Optimized for extreme density ops.* +A bucket whose `ids.size()` reaches `MAX_SIZE` (1024) is split at a +**value boundary**, not the median. We scan right (then left) from the median +to find the first index where `deltas[i] != deltas[i-1]`, then split there. 
+This guarantees the right bucket's key (`base + delta[split]`) differs from +the left bucket's key, so MDBX never sees duplicate keys. -### 4.1. Storage Architecture -Treated as a specialized Category filter with strictly two possible keys per field. -* **Keys:** `[FieldName]:0` (False) and `[FieldName]:1` (True). - * Consistent with the text-based key design (uses `:` separator). -* **Value:** `RoaringBitmap`. +If the bucket is **all duplicates of `base_value`** (no value boundary +anywhere), the split cannot succeed. We fall through and just append the new +entry, letting the bucket sit momentarily over `MAX_SIZE`: -### 4.2. Strategy -Boolean filters are typically low-selectivity (often matching ~50% of the DB). They are processed **Last** in the intersection chain unless statistics indicate high skew (e.g., `Is_Active` is true for 99% of data, so filtering for `False` is fast). +- If the new value equals `base_value`, the duplicate run extends and the + fallthrough repeats on the next insert. +- If the new value is greater than `base_value`, the bucket now has a value + boundary; the very next insert into this bucket will slide-split cleanly. + +This is the path that creates **bitmap-only ids** (see next section). + +### 5.4 Saturated-duplicate path / bitmap-only ids + +`Bucket::add` ([numeric_index.cpp:91](../src/filter/numeric_index.cpp#L91)) +has this branch: + +```cpp +if (delta_32 == 0 && ids.size() >= MAX_SIZE) { + return; // id only goes into summary_bitmap +} +``` + +When the bucket is saturated and the incoming value equals `base_value`, the +id is added to `summary_bitmap` only. The arrays don't grow. The bitmap is +the source of truth for membership. + +Three places that depend on this: + +1. **Range scan** ([numeric_index.cpp:1011](../src/filter/numeric_index.cpp#L1011)) + handles `bucket.ids.empty()` but `summary_bitmap` non-empty: include the + bitmap iff `base_value` is in `[min_val, max_val]`. +2. 
**Partial-overlap scan** + ([numeric_index.cpp:1049](../src/filter/numeric_index.cpp#L1049)) reconstructs + the bitmap-only subset by `summary_bitmap` minus `{ ids[i] : deltas[i] != 0 }`. +3. **Slide split** ([numeric_index.cpp:629](../src/filter/numeric_index.cpp#L629)) + computes the left bucket's bitmap as `original_bitmap - right_bucket.ids` + instead of rebuilding it from `ids[]`, which would lose bitmap-only entries. + +### Caveats + +- **`Bucket::is_empty()` looks at both `ids.empty()` and + `summary_bitmap.isEmpty()`** ([numeric_index.cpp:306](../src/filter/numeric_index.cpp#L306)). + This was a fix on this branch. Older versions only looked at `ids`, which + would let a delete operation delete a bucket that still had bitmap-only ids. +- **Bucket size is not page-bounded.** `summary_bitmap` size depends on the + user-space insertion pattern, not the entry count. A high-cardinality + bucket can be much larger than an MDBX page. There is a TODO in the header + to bound buckets by page size; today they are bounded only by + `MAX_SIZE = 1024` on the array side. +- **Bitmap-only partial-overlap is expensive.** The reconstruction at + [numeric_index.cpp:1069](../src/filter/numeric_index.cpp#L1069) copies the + full bitmap then `remove()`s every delta-zero entry. For a bucket dominated + by saturated duplicates this is a real cost. + +### 5.5 Range scan: fast path + +`NumericIndex::range` ([numeric_index.cpp:902](../src/filter/numeric_index.cpp#L902)) +walks buckets forward from the start of the query. For every bucket whose +**entire `[base, base + MAX_DELTA]` extent** lies inside `[min_val, max_val]`, +it skips the full deserialize and reads only the `summary_bitmap` +(`Bucket::read_summary_bitmap`). This fires on every interior bucket of a wide +scan and is the reason wide ranges only pay deltas/ids parsing on the start +and end buckets. 
+ +**Caveat:** the fast path is conservative — it requires the **declared +extent** to be covered, not the actual `[bucket_min, bucket_max]`. A bucket +packed tightly inside its extent still pays the deserialize unless the whole +65 K-wide window is inside the query. The TODO is to store actual bucket +min/max in the bucket header. + +### 5.6 Batch writes + +`NumericIndex::put_batch` ([numeric_index.cpp:800](../src/filter/numeric_index.cpp#L800)) +commits in **chunks of `BATCH_TXN_CHUNK_SIZE = 256`**. This caps each +write transaction's dirty-page footprint so MDBX cannot blow past the env +map size on a multi-thousand-entry batch (the `750e5d8` commit). The +trade-off is that the batch is not atomic across chunks. + +--- + +## 6. Category / boolean index + +`CategoryIndex` ([category_index.cpp](../src/filter/category_index.cpp)) +maps a formatted key `:` to a `RoaringBitmap`. Booleans are +treated as a category with values `"0"` / `"1"`. + +``` +add(field, value, id): + 1) txn: read bitmap for key (read-only txn) + 2) bitmap.add(id) + 3) txn: write bitmap (read-write txn) +``` + +Two transactions. `remove` is the same shape with `bitmap.remove(id)`. +`add_batch_by_key` uses `addMany` so the in-memory union is O(N) instead of +N individual `add()`s. + +### Caveats + +- **Read-modify-write across two txns is not atomic.** Two concurrent + `add()` calls to the same key can produce a lost update (writer B's read + predates writer A's commit). High-write workloads on a hot category need + external serialization until this moves into a single txn. +- **The whole bitmap is rewritten on every `add`/`remove`.** For a hot + category with millions of ids this is wasteful. Tracked in the perf TODO + list (see [filter_todo.md](filter_todo.md)). +- **Empty keys are not garbage collected.** Removing the last id from a key + leaves an empty bitmap in MDBX. 
+- **`$in` with an empty string is silently skipped.** `computeFilterBitmap` + skips category values whose string form is empty + ([filter.cpp:465](../src/filter/filter.cpp#L465)). An empty-string match + cannot be expressed in the current shape. --- -## 5. Schema & Type Enforcement +## 7. Bitmap deserialization safety + +Both indexes use a hardened deserialization helper: + +- `Bucket::read_bitmap_payload` in + [numeric_index.cpp:46](../src/filter/numeric_index.cpp#L46). +- `CategoryIndex::read_bitmap_payload` in + [category_index.cpp:14](../src/filter/category_index.cpp#L14). + +Both follow the same pattern: + +1. `roaring_bitmap_portable_deserialize_size(bytes, len)` to verify the + payload self-describes a complete bitmap with no trailing junk. +2. `RoaringBitmap::readSafe(bytes, len)` (the bounds-checked deserializer). +3. `roaring_bitmap_internal_validate` to catch malformed run/array + containers. + +Any failure returns `code = 200`. This is the `a46d0b8 safe filter bitmap +deserialization` commit. Before this landed, a corrupt or empty bucket +payload could be silently treated as an empty bitmap; now it surfaces as a +corruption error. + +--- + +## 8. Query API and operators + +Top-level entry points on `Filter`: + +- `computeFilterBitmap(filter_array)` — returns the bitmap of ids matching + the AND of all clauses. +- `getIdsMatchingFilter(filter_array)` — same, materialized as a vector. +- `countIdsMatchingFilter(filter_array)` — same, materialized as a size_t. +- `check_numeric(field, id, op, val)` — fast point check via the forward + index; used by `VectorStorage::matches_filter`. + +`filter_array` is a JSON **array** of single-field objects. 
Each clause uses +a Mongo-style `$op`: + +```jsonc +[ + { "category": { "$eq": "books" } }, + { "in_stock": { "$eq": true } }, + { "price": { "$range": [10, 50] } }, + { "rating": { "$gte": 4.0 } }, + { "discount": { "$lt": 20 } }, + { "tags": { "$in": ["sale", "new"] } } +] +``` + +Operators supported (`computeFilterBitmap` in +[filter.cpp:372](../src/filter/filter.cpp#L372)): + +| operator | types | notes | +|------------|---------------|-----------------------------------------------------------------------| +| `$eq` | any | numeric → `range(v, v)`; category → bitmap lookup. | +| `$in` | any | array; numeric → per-item range; category → per-value bitmap union. | +| `$range` | Number | `[start, end]` inclusive in float32-sortable order. Errors if start > end. | +| `$lt` | Number | uses `nextafterf(x, -inf)` to make the bound exclusive. | +| `$lte` | Number | inclusive. | +| `$gt` | Number | uses `nextafterf(x, +inf)` to make the bound exclusive. | +| `$gte` | Number | inclusive. | + +After all clauses are evaluated, partial bitmaps are sorted by cardinality +ascending and AND-intersected smallest-first. The intersection short-circuits +as soon as the result is empty. + +### Caveats + +- **All clauses materialize before intersecting.** There is no cardinality + estimator and no "cheapest first" lazy evaluation despite what the older + doc claimed. Every clause runs an MDBX read pass on its own + read-only transaction; only **after** all of them complete does the AND + start. See [filter.cpp:521-545](../src/filter/filter.cpp#L521). +- **No clause-level shared snapshot.** Each `$eq`/`$in`/`$range`/`$lt..` + opens its own MDBX read txn. A concurrent writer can produce a result that + mixes pre- and post-write snapshots across clauses. The operation lock in + `searchKNN` is also intentionally disabled + ([ndd.hpp:1633](../src/core/ndd.hpp#L1633)), so reads do not serialize + against writes either. 
+- **Field name and `$in` value validation:** the field name must not contain + `:` (it is the key delimiter). Same rule for category values. + `validate_filter_key_component` ([filter.cpp:30](../src/filter/filter.cpp#L30)) + rejects on `:` and returns code `1`. Length, NUL bytes, control bytes, and + MDBX max-key are **not** validated. Category values are capped at 255 chars + ([filter.cpp:96](../src/filter/filter.cpp#L96)). +- **The old doc said category values may contain `:`.** They cannot. Code is + authoritative; this version of the doc reflects the code. +- **Schema is only consulted via `schema_cache_` during search.** If a query + arrives before any insert has touched the field, `type` defaults to + `Unknown` and the query falls through to the category branch, which will + just return an empty bitmap. + +--- + +## 9. Vector storage integration + +`VectorStorage` owns the `Filter` instance (`filter_store_`) and is the only +caller that needs to keep three stores (`vector_store_`, `meta_store_`, +`filter_store_`) in sync. The two pieces worth knowing: + +### 9.1 Upsert cleanup (`store_vectors_batch`) + +Implemented in +[vector_storage.hpp:781](../src/storage/vector_storage.hpp#L781). Four phases: + +1. **Cleanup** — for every entry whose `is_new_to_db[i] == false` (i.e. the + id-mapper says this id was already live), read its prior `meta.filter` + and call `filter_store_->remove_filters_from_json(...)` to drop the old + filter index entries. Without this, a "rename" upsert leaves the old + filter still matchable. +2. **Vectors** — `vector_store_->store_vectors_batch`. +3. **Meta** — `meta_store_->store_meta_batch` (this is the moment + `meta.filter` becomes the new value; cleanup HAD to happen before this). +4. **Filters** — `filter_store_->add_filters_from_json_batch`. + +The `is_new_to_db` vector is the id-mapper's signal: + +- `true` → fresh slot, or reuse of a deleted slot. Nothing to clean. +- `false` → existing live id, an upsert. Cleanup required. 
+- empty → legacy caller; cleanup is silently skipped to preserve old + semantics. New callers always pass the signal. + +### 9.2 `deleteFilter` + +[vector_storage.hpp:1049](../src/storage/vector_storage.hpp#L1049). Removes +filter index entries AND clears `meta.filter` (only when it exactly matches +the input). This is the `b0e8425` commit — before it, `deleteFilter` only +touched the index, leaving `meta.filter` populated and drifted. + +### Caveats + +- **Cross-store atomicity is by design absent.** Vector, meta, filter, and + schema writes each commit in their own MDBX txn. A crash between phases + leaves torn state: e.g. the cleanup phase committed but phase 4 never ran, + so the index entries are gone for a vector that still claims (via + `meta.filter`) to have them. The operator-visible remedy is rebuild. +- **`store_vectors_batch` issues one extra MDBX read per upserted id** to + fetch the prior `meta.filter`. Fresh inserts skip this. Heavy upsert + workloads should expect that overhead. +- **The cleanup pass only protects new writes.** Drift accumulated before + this branch landed will not be fixed automatically. A targeted rebuild is + required to clean it up. +- **`meta.filter` is the source of truth for cleanup.** If `meta` is + unreadable for a live id (torn earlier write), `store_vectors_batch` + returns code `103` instead of silently overwriting — better to surface the + inconsistency than to make it worse. + +--- + +## 10. Search: filter-aware path + +`IndexManager::searchKNN` in +[ndd.hpp:1614](../src/core/ndd.hpp#L1614). When `filter_array` is non-empty: + +1. Compute the filter bitmap. +2. If sparse search is enabled, run the sparse query in another thread, with + the filter bitmap passed in. +3. For the dense path, branch on the bitmap's cardinality (`card`): + - `card == 0` → no dense results. + - `card < params.prefilter_threshold` → **brute force on the small set**. 
+ Iterate the bitmap into `valid_ids`, visit those vectors via + `visit_vectors_by_ids`, compute distances directly, keep a top-`k` heap. + This bypasses HNSW. + - Otherwise → **filtered HNSW**. Pass a `BitMapFilterFunctor` and + `params.boost_percentage` to `HierarchicalNSW::searchKnn`. + +`FilterParams` ([core/types.hpp](../src/core/types.hpp)): + +```cpp +struct FilterParams { + size_t prefilter_threshold = settings::PREFILTER_CARDINALITY_THRESHOLD; // default 10_000 + size_t boost_percentage = settings::FILTER_BOOST_PERCENTAGE; // default 0 +}; +``` + +Both are accepted from the HTTP body under `filter_params` (see +[main.cpp:839](../src/main.cpp#L839)). + +### HNSW fatigue boost + +When `filter_boost_percentage > 0` and a filter is active, +`HierarchicalNSW::searchKnn` ([hnswalg.h:1490](../src/hnsw/hnswalg.h#L1490)) +inflates the early-exit budget by `(100 + boost) / 100`. The intuition: with +a filter the graph is rejecting more candidates, so it pays to explore more +before giving up. Set `boost_percentage > 0` if recall drops on filtered +queries. + +### Caveats + +- **The operation lock is intentionally disabled in search** + ([ndd.hpp:1632-1637](../src/core/ndd.hpp#L1632)). The comment is explicit: + "We aren't using reader's lock here to enable reads while writing. + TODO: check correctness when stressing the system." Filter results can be + inconsistent under concurrent writes. +- **The brute-force branch reads vectors one-by-one via a visitor.** This is + fast for sparse filters (small `card`) but degrades sharply if you raise + `prefilter_threshold` past a few tens of thousands. +- **HNSW filter functors are called inside the inner search loop** — keep + `bitmap.contains(id)` cheap. `BitMapFilterFunctor` wraps `RoaringBitmap` + which is already fast, but custom functors should not allocate. 
+- **`matches_filter`** in + [vector_storage.hpp:622](../src/storage/vector_storage.hpp#L622) is a + separate point-check API for callers that already have a vector in hand + (e.g. recovery). It tries the index for numeric clauses and parses + `meta.filter` JSON for string/bool clauses. It is NOT used by the main + search path. + +--- + +## 11. Limits and validation summary + +What the public surface will reject (code `1` or `2`): + +- `filter_array` not a JSON array. +- Any clause that is not a single-field object. +- Field name empty or containing `:`. +- `$op` value's JSON type mismatching the operator (`$range` not a 2-array, + `$in` not an array, comparison value not a number). +- Non-finite numbers. +- Category values that are not string/integer/boolean, or that exceed 255 + bytes, or that contain `:`. +- `$range` with `start > end`. +- `$lt`/`$lte`/`$gt`/`$gte` on a non-Number field. + +What it will **not** reject yet: + +- Overly long field names. +- NUL or control bytes in field name or value. +- Keys that exceed MDBX max-key-size (manifests as code 100 from MDBX, not + code 1). +- Two distinct large integers that collapse to the same float32 key (silent; + see §4 caveats). + +--- + +## 12. Open work / where the bodies are buried + +Treat [docs/filter_todo.md](filter_todo.md) and +[docs/filter_issue_drafts.md](filter_issue_drafts.md) as the authoritative +TODO list. Highlights: -To ensure index integrity without a strict schema registry, the system adheres to **First-Write Wins** typing. +- **Atomicity.** Schema, numeric, category, and meta writes are independent + txns. Crash recovery needs a journal or single-txn execution; meanwhile + rebuild is the only safe recovery. +- **Concurrent category writes.** Read-modify-write across two txns can + drop concurrent updates to the same key. +- **Search snapshot consistency.** Filter clauses each open their own read + txn, and `searchKNN` skips the operation lock. Multi-clause results may + mix snapshots. 
+- **Schema bypass by low-level category APIs.** `add_to_filter()` does not + consult the schema. +- **Numeric bucket format is unversioned.** Old DBs need rebuild; tests + explicitly reject the legacy count-prefixed payload. +- **Fast path is coarse.** Tight buckets inside their extent still + deserialize. +- **`$in` issues one MDBX read txn per value.** Batch under one txn. +- **Cardinality estimator does not exist.** All partial bitmaps materialize + before intersection. -* **Immutable Types:** Once a `FieldName` is indexed with a specific type (Numeric, Category, or Boolean), that type is bound to the field. -* **Validation Logic:** - * If `is_active` is first seen as **Boolean**, subsequent attempts to insert `is_active: "yes"` (Category) or `is_active: 1` (Numeric bucket) must be rejected. - * This prevents storage corruption and ambiguous query parsing. +If you are about to land filter changes, scan §3-9 caveats first and check +whether your change closes any of them. From e56debede50aaf88a2d14274290471e9716e4fad Mon Sep 17 00:00:00 2001 From: Shaleen Garg Date: Wed, 13 May 2026 17:06:49 +0530 Subject: [PATCH 24/28] mac compile time flags to use xcrun to find the correct clang version --- CMakeLists.txt | 76 ++++++++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 37 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5fa15b0d93..7ae6c10a39 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,54 +1,56 @@ -# For Server x86_64 install clang for fp16 support and use the following commands cmake_minimum_required(VERSION 3.14) -project(ndd) - - -#check for the correct clang version -find_program(CLANG_CANDIDATE_C - NAMES clang-21 clang-20 clang-19 clang -) -find_program(CLANG_CANDIDATE_CXX - NAMES clang++-21 clang++-20 clang++-19 clang++ -) +# Select a Clang toolchain before project() so CMake configures the matching +# standard library and platform SDK paths. 
+if(NOT DEFINED CMAKE_C_COMPILER AND NOT DEFINED ENV{CC}) + if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Darwin") + execute_process( + COMMAND xcrun --find clang + OUTPUT_VARIABLE CLANG_CANDIDATE_C + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + ) + else() + find_program(CLANG_CANDIDATE_C NAMES clang-21 clang-20 clang-19 clang) + endif() -if (NOT CLANG_CANDIDATE_C OR NOT CLANG_CANDIDATE_CXX) - message(FATAL_ERROR "Clang not found. Please install clang >= 19.") + if(CLANG_CANDIDATE_C) + set(CMAKE_C_COMPILER "${CLANG_CANDIDATE_C}" CACHE FILEPATH "C compiler") + endif() endif() -# Query clang version -execute_process( - COMMAND ${CLANG_CANDIDATE_C} --version - OUTPUT_VARIABLE CLANG_VERSION_OUTPUT - ERROR_VARIABLE CLANG_VERSION_ERROR - OUTPUT_STRIP_TRAILING_WHITESPACE -) +if(NOT DEFINED CMAKE_CXX_COMPILER AND NOT DEFINED ENV{CXX}) + if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Darwin") + execute_process( + COMMAND xcrun --find clang++ + OUTPUT_VARIABLE CLANG_CANDIDATE_CXX + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + ) + else() + find_program(CLANG_CANDIDATE_CXX NAMES clang++-21 clang++-20 clang++-19 clang++) + endif() -if (NOT CLANG_VERSION_OUTPUT) - message(FATAL_ERROR "Failed to query clang version.") + if(CLANG_CANDIDATE_CXX) + set(CMAKE_CXX_COMPILER "${CLANG_CANDIDATE_CXX}" CACHE FILEPATH "C++ compiler") + endif() endif() -# Extract major version -string(REGEX MATCH "clang version ([0-9]+)" _match "${CLANG_VERSION_OUTPUT}") -set(CLANG_VERSION_MAJOR "${CMAKE_MATCH_1}") +project(ndd) -if (NOT CLANG_VERSION_MAJOR) - message(FATAL_ERROR - "Unable to determine clang version from:\n${CLANG_VERSION_OUTPUT}\n" - ) +if(NOT CMAKE_C_COMPILER_ID MATCHES "Clang" OR NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") + message(FATAL_ERROR "Clang >= 17 is required. 
Set CC/CXX or CMAKE_C_COMPILER/CMAKE_CXX_COMPILER to a Clang toolchain.") endif() -if (CLANG_VERSION_MAJOR LESS 17) - message(FATAL_ERROR - "Clang ${CLANG_VERSION_MAJOR} detected, but clang >= 17 is required " - ) +if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 17) + message(FATAL_ERROR "Clang ${CMAKE_CXX_COMPILER_VERSION} detected, but Clang >= 17 is required.") endif() -# Lock compilers -set(CMAKE_C_COMPILER "${CLANG_CANDIDATE_C}" CACHE STRING "" FORCE) -set(CMAKE_CXX_COMPILER "${CLANG_CANDIDATE_CXX}" CACHE STRING "" FORCE) +if(APPLE AND (CMAKE_C_COMPILER MATCHES "[/\\\\]Android[/\\\\].*[/\\\\]ndk[/\\\\]" OR CMAKE_CXX_COMPILER MATCHES "[/\\\\]Android[/\\\\].*[/\\\\]ndk[/\\\\]")) + message(FATAL_ERROR "Android NDK Clang was selected for a macOS build. Use Xcode Command Line Tools Clang, for example: CC=$(xcrun --find clang) CXX=$(xcrun --find clang++) cmake ...") +endif() -message(STATUS "Using Clang ${CLANG_VERSION_MAJOR}") +message(STATUS "Using ${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") message(STATUS "C compiler : ${CMAKE_C_COMPILER}") message(STATUS "C++ compiler: ${CMAKE_CXX_COMPILER}") From fb9e34cd2a1e809357f1efe0bdde852b41e524e0 Mon Sep 17 00:00:00 2001 From: Shaleen Garg Date: Fri, 15 May 2026 07:46:14 +0000 Subject: [PATCH 25/28] docs: Part 2 follow-ups deferred from filter_safety Records the four filter_pass commits skipped from the Part-1 split (546430d, b0e8425, e9cca02, 4cb445d), the hpp->cpp refactor (7743296) deferred to be bundled with the bucket layout change, and the Part-2 test files split out of 02acc13. Documents the Part-1 carry-forwards (Bucket count field, sortable_from_json int branch) that exist to keep filter_safety byte-compatible with master-built indexes and that Part 2 should remove. 
--- docs/filter_part2_followups.md | 197 +++++++++++++++++++++++++++++++++ 1 file changed, 197 insertions(+) create mode 100644 docs/filter_part2_followups.md diff --git a/docs/filter_part2_followups.md b/docs/filter_part2_followups.md new file mode 100644 index 0000000000..14b197c50c --- /dev/null +++ b/docs/filter_part2_followups.md @@ -0,0 +1,197 @@ +# Filter `filter_safety` → `filter_pass` (Part 2) follow-ups + +This document is the running record of everything from the `filter_pass` branch that **was not** brought into `filter_safety` (the Part-1 split). Each entry lists the original commit, what it does, why it was deferred, the on-disk or behavioral contract it changes, and the exact follow-up work needed when Part 2 is opened. + +`filter_safety` ends at the commit `e56debe mac compile time flags to use xcrun to find the correct clang version`. Part 2 should branch from `filter_safety` (not from `master` or `filter_pass`) so that it inherits the validation, perf, and refactor work that Part 1 already paid for. + +> **Why two parts at all?** +> Part 1 is byte-compatible with filter indexes built by `master`. A deployment can drop in `filter_safety` without rebuilding any data. Part 2 changes the on-disk bucket layout, the numeric sortable-key domain, and the upsert semantics — none of those can ship without a rebuild step, and bundling them with Part 1 would have forced every existing deployment to reindex just to pick up the `$gt`/`$lt` operators or the validation fixes. + +--- + +## Index — Part 2 deferred items + +1. [`546430d` — Numeric bucket on-disk layout (drops `count`, adds bitmap-only routing)](#1-546430d--numeric-bucket-on-disk-layout) +2. [`b0e8425` — Upsert cleanup pass and `deleteFilter` meta sync](#2-b0e8425--upsert-cleanup-and-deletefilter-meta-sync) +3. [`e9cca02` — Unified float32 numeric sortable domain](#3-e9cca02--unified-float32-numeric-sortable-domain) +4. 
[`4cb445d` — Query/removal/split handling of bitmap-only state](#4-4cb445d--bitmap-only-state-in-query--removal--split) +5. [`7743296` — `filter` headers split into `.cpp` + `.hpp`](#5-7743296--header--cpp-split-for-filter-) +6. [Part-2 portion of `02acc13` — vector_storage / numeric stress / repo_filter.py tests + Part-2 cases in `filter_test.cpp`](#6-02acc13-part-2-tests) + +Cross-cutting items that Part 1 *temporarily* preserved to stay backward compatible and that Part 2 should clean up: + +- [Part-1 carry: `count` field in `Bucket` serialization](#carry-1-bucket-count-field) +- [Part-1 carry: Read-side bitmap-only handling comment in `read_summary_bitmap`](#carry-2-read_summary_bitmap-comment) +- [Part-1 carry: `int_to_sortable` / `float_to_sortable` split in `sortable_from_json`](#carry-3-int_to_sortable--float_to_sortable-split) + +--- + +## 1. `546430d` — Numeric bucket on-disk layout + +**Why deferred**: changes the bytes on disk. A `master`-built bucket cannot be read by a branch carrying this commit; the new deserializer throws `"Bucket corrupt: residual bytes not aligned"`. + +**What it changes** + +- Bucket serialization format: + - Old (Part 1): `[bm_size : u32][bitmap][count : u16][deltas : count*u16][ids : count*idInt]` + - New (Part 2): `[bm_size : u32][bitmap][deltas][ids]` — `count` removed; recovered from `(iov_len - sizeof(u32) - bm_size) / (sizeof(u16) + sizeof(idInt))`. +- New saturated-duplicate routing in `Bucket::add` — when `delta_32 == 0` and `ids.size() >= MAX_SIZE`, the new id is added to `summary_bitmap` only (bitmap-only state) and the parallel arrays are not grown. +- `Bucket::remove` reads `summary_bitmap` as the source of truth instead of relying on a successful scan of `ids[]`. + +**Affected files**: [src/filter/numeric_index.hpp](src/filter/numeric_index.hpp) (or its `.cpp` after item 5 lands). + +**Part 2 checklist** +- [ ] Drop `count` field from `Bucket::serialize` / `Bucket::deserialize`. 
Make sure the residual-byte math in the new deserializer matches what `read_summary_bitmap` already expects. +- [ ] Implement the duplicate-cliff fix (`delta_32 == 0 && ids.size() >= MAX_SIZE` → bitmap-only insert). +- [ ] Implement the new `Bucket::remove` semantics (bitmap is source of truth; arrays cleaned best-effort). +- [ ] Provide a migration path: either bump a stored version sentinel and refuse to open old buckets, or perform an on-open conversion. Currently Part 1 silently rounds-trips old buckets through Part-1 serialize/deserialize. +- [ ] Pair this commit with item 4 (`4cb445d`) — query / range / split semantics depend on bitmap-only state existing. +- [ ] Remove the `Why count is intentionally ignored here` comment block in `read_summary_bitmap` (see [carry 2](#carry-2-read_summary_bitmap-comment)) — after this commit the comment is obsolete; replace it with a one-line note that residual bytes are pure data arrays. + +--- + +## 2. `b0e8425` — Upsert cleanup and `deleteFilter` meta sync + +**Why deferred**: not a format change, but the patch itself states: +> "This patch only prevents NEW stale filter index entries from accumulating. It does not retroactively scrub entries left behind by previous upserts written before the fix landed. **A targeted rebuild is required to clean historical drift.**" + +Any deployment that ran upserts on `master` carries stale filter index entries that no code path will clean up. To make the fix *correct* for those deployments, the operator must rebuild — same reindex requirement as a format change. + +**What it changes** + +- `store_vectors_batch(...)` gains a second parameter `const std::vector& is_new_to_db = {}`. When non-empty it must be the same size as `vectors`. The flag mirrors id_mapper's "was this str_id already mapped?" signal. 
+- New phase-1 cleanup pass: for each entry where `is_new_to_db[i] == false`, fetch the prior `meta.filter` via `meta_store_->get_meta(numeric_id).filter` and remove its category / numeric index entries before the new filter is written. +- `deleteFilter(...)` now clears `meta.filter` so subsequent `get_meta(...)` calls see the post-delete state. +- New error code path `103` ("Upsert cleanup: meta missing for numeric_id ...") for the torn-write case where id_mapper says the slot is live but meta cannot be loaded. + +**Affected files**: [src/storage/vector_storage.hpp](src/storage/vector_storage.hpp), [src/core/ndd.hpp](src/core/ndd.hpp). + +**Part 2 checklist** +- [ ] Add the `is_new_to_db` parameter and per-entry size check to `store_vectors_batch`. +- [ ] Implement the cleanup pass and the `103` error path. +- [ ] Update `ndd.hpp` call sites to thread the id_mapper signal (`numeric_ids[i].second`) into a parallel `is_new_to_db` vector. +- [ ] Implement `deleteFilter` meta sync. +- [ ] Document the rebuild requirement in the Part 2 release notes — operators upgrading from Part 1 with any history of upserts must reindex. +- [ ] Bring back `tests/vector_storage_test.cpp` from `filter_pass` (currently skipped — see item 6) and confirm it passes. + +--- + +## 3. `e9cca02` — Unified float32 numeric sortable domain + +**Why deferred**: encoding change. The commit message itself says +> "Existing filter DBs that indexed integers with `int_to_sortable` must be rebuilt." + +A `master`-built field that indexed integers stores keys like `int_to_sortable(2) = 0x80000002`. The new code reads queries through `float_to_sortable(2.0f) = 0xC0000000`. These don't compare equal, so old indexes return wrong answers under the new query path. + +**What it changes** + +- `Filter::sortable_from_json` no longer special-cases `is_number_integer()`; every JSON numeric goes through `float_to_sortable(value.get())`. 
+- `Filter::numeric_bound_from_comparison` ($gt / $gte / $lt / $lte) drops the integer-specific branches and uses `std::nextafterf` for strict bounds on all values. +- Rejects non-finite floats with HTTP 400 ("$op value must be a finite number"). + +**Affected files**: [src/filter/filter.cpp](src/filter/filter.cpp) and [src/filter/filter.hpp](src/filter/filter.hpp). + +**Part 2 checklist** +- [ ] Replace the `is_number_integer()` branch in `sortable_from_json` with the unified float32 path. +- [ ] Strip the integer special-case from each of `$gt`, `$gte`, `$lt`, `$lte`. +- [ ] Add the finite-float32 check and the warn/reject path. +- [ ] Update the docstring on `sortable_from_json` and `numeric_bound_from_comparison` (the long version is in `e9cca02`'s diff — paste verbatim). +- [ ] Re-introduce the two Part-2 tests in `filter_test.cpp`: `IntegerIndexedNumericFieldCanBeQueriedWithFloatNumber` and `FloatIndexedNumericFieldCanBeQueriedWithIntegerNumber`, plus the `NumericRangeBench.FloatDomainVsIntegerDomain` benchmark. + +--- + +## 4. `4cb445d` — Bitmap-only state in query / removal / split + +**Why deferred**: mechanically a query-side change with no format change, but it is only load-bearing once item 1 (`546430d`) is in. Without bitmap-only state existing on disk, this commit is a no-op. Keeping it in Part 2 keeps the bucket logic coherent inside one PR. + +**What it changes** + +- `Bucket::is_empty()` from `return ids.empty();` to `return ids.empty() && summary_bitmap.isEmpty();` +- Slide-split bitmap rebuild stops doing `bitmap = empty; for(id : ids) bitmap.add(id)` — instead, subtracts only the ids that moved right (preserves delta-0 bitmap-only entries on the left). +- `range()` slow path: when `bucket.ids.empty()` but `summary_bitmap` is non-empty, include the bitmap iff `base_value ∈ [min_val, max_val]`. + +**Affected files**: [src/filter/numeric_index.cpp](src/filter/numeric_index.cpp) (or [.hpp](src/filter/numeric_index.hpp) if item 5 hasn't landed yet). 
+ +**Part 2 checklist** +- [ ] Apply alongside item 1, not before. Verify the three test cases from `filter_test.cpp` (`SplitPreservesBitmapOnlyDuplicates`, `RemoveKeepsBucketAliveWithBitmapOnlyEntries`, `RangeSlowPathReturnsBitmapOnlyEntries`) pass after both land. + +--- + +## 5. `7743296` — Header → cpp split for `filter/` + +**Why deferred**: pure refactor (zero behavioral content), but the cpp file in `filter_pass` was authored *after* item 1 (`546430d`) landed in that branch, so it contains the Part-2 implementations of `Bucket::add`, `remove`, `serialize`, `deserialize`, `add_to_buckets`, and `range`. Cherry-picking it would either drag Part-2 semantics into Part 1, or force us to rewrite ~7 method bodies by hand. We chose to keep the implementations in `.hpp` for Part 1 and let Part 2 carry the split. + +**What it changes** + +- New files: `src/filter/category_index.cpp`, `src/filter/filter.cpp`, `src/filter/numeric_index.cpp`. +- `CMakeLists.txt`: new `ndd_filter` library target (`add_library(ndd_filter STATIC ${NDD_FILTER_SOURCES})`) with its own include directories and `MDB_MAXKEYSIZE=512` definition. +- All the corresponding `.hpp` files reduced to declarations only. + +**Affected files**: `CMakeLists.txt`, `src/filter/*.hpp`, new `src/filter/*.cpp`. + +**Part 2 checklist** +- [ ] Land the split *together with* items 1 and 4 (`546430d` + `4cb445d`) so the new `.cpp` files contain the Part-2 implementations from the start. This is the cleanest reorg: one commit replaces the inlined Part-1 bodies with declarations and drops the Part-2 bodies into the new `.cpp` files. +- [ ] Re-introduce the `add_library(ndd_filter STATIC ...)` target and its compile options. `-falign-functions=64` is already on all three Part-1 targets (`ndd_core`, `ndd_filter`-elect, `${NDD_BINARY_NAME}`) per the user's CMake decision — no flag work needed, just wire the new target. 
+- [ ] When the bucket implementation moves into the new `.cpp`, the comment block on `read_summary_bitmap` ([carry 2](#carry-2-read_summary_bitmap-comment)) should be condensed — the count field will be gone by then. + +--- + +## 6. `02acc13` Part-2 tests + +**Why deferred**: half of `02acc13`'s test content depends on Part-2 behavior and would not compile or pass against Part 1. We split the commit at cherry-pick time and committed only the Part-1 portion as `4bab3b9 testing (Part 1 subset)`. + +**Already in Part 1** (`4bab3b9`): +- [tests/request_validation_test.cpp](tests/request_validation_test.cpp) — tests for `3e33557` filter parameter validation. +- The new `add_executable(ndd_request_validation_test ...)` block in [tests/CMakeLists.txt](tests/CMakeLists.txt) and the corresponding `gtest_discover_tests`. + +**Deferred to Part 2**: +- `tests/vector_storage_test.cpp` — exercises `store_vectors_batch(..., is_new_to_db)` upsert cleanup and `deleteFilter` meta sync. Direct dependency on item 2. +- `tests/numeric_index_stress_test.cpp` — random churn + drain phase, asserts the forward↔inverted invariant across the bitmap-only / split paths. Direct dependency on items 1 and 4. +- `tests/repo_filter.py` — reproducer for the "65,536 duplicate cliff" fixed by item 1. +- Six `TEST_F` additions to `tests/filter_test.cpp`: + - `IntegerIndexedNumericFieldCanBeQueriedWithFloatNumber` (item 3) + - `FloatIndexedNumericFieldCanBeQueriedWithIntegerNumber` (item 3) + - `SplitPreservesBitmapOnlyDuplicates` (items 1 + 4) + - `RemoveKeepsBucketAliveWithBitmapOnlyEntries` (item 4) + - `RangeSlowPathReturnsBitmapOnlyEntries` (item 4) + - `NumericRangeBench.FloatDomainVsIntegerDomain` (item 3, benchmark) +- The `add_executable(ndd_vector_storage_test ...)` and `add_executable(ndd_numeric_index_stress_test ...)` blocks in [tests/CMakeLists.txt](tests/CMakeLists.txt) and their `gtest_discover_tests` calls. 
+ +**Part 2 checklist** +- [ ] When items 1–4 are all in, cherry-pick the *original* `02acc13` on top to recover the deferred files. The conflicts with `4bab3b9` will be small (the includes block at the top of `filter_test.cpp` is already taken; `tests/CMakeLists.txt` already has the request_validation_test block — just re-add the other two executables). + +--- + +## Cross-cutting "Part 1 paid for backward compatibility" items + +These were not deferred — they were *added* in Part 1 specifically so Part 1 could ship without a rebuild. Part 2 will simplify or remove them. + +### Carry 1: Bucket `count` field + +`Bucket::serialize` still writes a `uint16_t count` after the bitmap and before the deltas / ids arrays, and `Bucket::deserialize` still reads it. Master and pre-Part-1 deployments rely on this field, so Part 1 keeps it. Part 2 item 1 removes it. + +**Why we kept it**: an existing on-disk bucket carries the `count` field. If Part 1 dropped it from the serializer, every round-trip (read + modify + write) would corrupt the bucket. The byte-length-derived count (item 1's approach) only works for buckets written by the new serializer. + +### Carry 2: `read_summary_bitmap` comment + +[src/filter/numeric_index.hpp](src/filter/numeric_index.hpp) lines ~219–263 carries a multi-paragraph comment on `read_summary_bitmap` that explains why the `count` field is intentionally not read by this function, and explicitly forecasts its removal. When item 1 lands, that comment becomes obsolete and should be condensed to a one-liner. + +### Carry 3: `int_to_sortable` / `float_to_sortable` split + +[src/filter/filter.hpp](src/filter/filter.hpp) `Filter::sortable_from_json` (and the four comparison operators in `numeric_bound_from_comparison`) still branch on `is_number_integer()`. This preserves the dual-domain encoding that master uses. Part 2 item 3 unifies these onto `float_to_sortable`. 
+ +--- + +## Verification snapshot (taken at end of Part 1) + +These invariants must continue to hold at the tip of `filter_safety` until Part 2 begins: + +- `Bucket::is_empty()` returns `ids.empty()` only — no bitmap check. +- `Bucket::serialize` writes `count` and `Bucket::deserialize` reads it (file: [src/filter/numeric_index.hpp](src/filter/numeric_index.hpp)). +- `Filter::sortable_from_json` has the `is_number_integer()` → `int_to_sortable` branch. +- `store_vectors_batch` is single-argument (no `is_new_to_db`). +- `Bucket::add` has no saturated-duplicate / bitmap-only path; every insert goes into the parallel arrays. +- `add_to_buckets` slide-split rebuilds the left bitmap from `ids[]` (the Part-1 way), not by subtracting moved ids. +- `tests/vector_storage_test.cpp` and `tests/numeric_index_stress_test.cpp` do not exist in the working tree. + +If any of these flip during a future Part-1 maintenance commit, that commit has accidentally pulled in Part-2 semantics and should be reverted. From afab9c46d68d57daccebe3b5937f84513b36ade7 Mon Sep 17 00:00:00 2001 From: Shaleen Garg Date: Fri, 15 May 2026 07:53:53 +0000 Subject: [PATCH 26/28] tests: skip Part-2 regression alarms with GTEST_SKIP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three Hypothesis tests from a46d0b8 (safe filter bitmap deserialization) assert behavior that only exists after Part 2: - Hypothesis2.SaturationCreatesBitmapOnlyEntries — expects Bucket::add to route delta-0 inserts past MAX_SIZE into the summary bitmap (546430d). - Hypothesis4.DeserializeRejectsLegacyCountFormat — expects the count-less deserializer to reject the legacy on-disk shape (546430d). - Hypothesis4.ReadSummaryBitmapRejectsLegacyCountFormat — expects read_summary_bitmap to reject the same shape via an alignment check; Part 1 intentionally removed that check because the count field is still part of the layout. 
Each test now calls GTEST_SKIP() with a message pointing at docs/filter_part2_followups.md. Part 2 must remove these skips when the underlying fixes land. --- docs/filter_part2_followups.md | 1 + tests/filter_test.cpp | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/docs/filter_part2_followups.md b/docs/filter_part2_followups.md index 14b197c50c..0e77e84edb 100644 --- a/docs/filter_part2_followups.md +++ b/docs/filter_part2_followups.md @@ -47,6 +47,7 @@ Cross-cutting items that Part 1 *temporarily* preserved to stay backward compati - [ ] Provide a migration path: either bump a stored version sentinel and refuse to open old buckets, or perform an on-open conversion. Currently Part 1 silently rounds-trips old buckets through Part-1 serialize/deserialize. - [ ] Pair this commit with item 4 (`4cb445d`) — query / range / split semantics depend on bitmap-only state existing. - [ ] Remove the `Why count is intentionally ignored here` comment block in `read_summary_bitmap` (see [carry 2](#carry-2-read_summary_bitmap-comment)) — after this commit the comment is obsolete; replace it with a one-line note that residual bytes are pure data arrays. +- [ ] Remove the `GTEST_SKIP()` calls from `Hypothesis2.SaturationCreatesBitmapOnlyEntries`, `Hypothesis4.DeserializeRejectsLegacyCountFormat`, and `Hypothesis4.ReadSummaryBitmapRejectsLegacyCountFormat` in `tests/filter_test.cpp` — these are Part-2 regression alarms that Part 1 silenced because they assert behavior that doesn't exist yet. After this commit they must pass. --- diff --git a/tests/filter_test.cpp b/tests/filter_test.cpp index 81cf0155ce..c130386e04 100644 --- a/tests/filter_test.cpp +++ b/tests/filter_test.cpp @@ -688,6 +688,11 @@ TEST(Hypothesis1, FastPathFiresOnWidelySpreadBucket) { // and the new bitmap-only-inclusion branch in range() returns ids // that the OLD code would never have surfaced. 
TEST(Hypothesis2, SaturationCreatesBitmapOnlyEntries) { + GTEST_SKIP() << "Part 2 alarm: Bucket::add saturated-duplicate routing " + "to summary_bitmap is introduced by 546430d. See " + "docs/filter_part2_followups.md item 1. Remove this " + "GTEST_SKIP when Part 2 lands."; + constexpr uint32_t base = 0; constexpr ndd::idInt N_TOTAL = ndd::filter::Bucket::MAX_SIZE + 500; @@ -778,6 +783,14 @@ TEST(Hypothesis3, SlideSplitRebuildLosesBitmapOnlyEntries) { // can grow the range result candidate set. The production reader now // rejects that payload shape instead of trying to salvage it. TEST(Hypothesis4, DeserializeRejectsLegacyCountFormat) { + GTEST_SKIP() << "Part 2 alarm: legacy count-bearing layout is still " + "the on-disk format in Part 1, so Bucket::deserialize " + "accepts it. Part 2 commit 546430d drops the count " + "field; the residual-bytes-not-aligned check then " + "rejects the legacy shape. See " + "docs/filter_part2_followups.md item 1. Remove this " + "GTEST_SKIP when Part 2 lands."; + // Manually craft an OLD-format payload: // [u32 bm_size] [bitmap bytes] [u16 count=0] // i.e. cliff-truncated count, but bitmap retained the lost ids. @@ -811,6 +824,15 @@ TEST(Hypothesis4, DeserializeRejectsLegacyCountFormat) { // reject the same legacy-format payloads as the full deserializer, so // the fast path cannot silently reintroduce compatibility. TEST(Hypothesis4, ReadSummaryBitmapRejectsLegacyCountFormat) { + GTEST_SKIP() << "Part 2 alarm: read_summary_bitmap intentionally " + "ignores the count-bearing trailer in Part 1 (see " + "the comment block on read_summary_bitmap in " + "numeric_index.hpp). Part 2 commit 546430d drops the " + "count field and the alignment check then catches " + "the legacy shape. See docs/filter_part2_followups.md " + "item 1 and carry 2. 
Remove this GTEST_SKIP when " + "Part 2 lands."; + ndd::RoaringBitmap original; for (ndd::idInt i = 0; i < 50; ++i) original.add(i * 3); original.runOptimize(); From 89e32ea2225b669c4dc1f02b250adba5c5a2d23c Mon Sep 17 00:00:00 2001 From: Shaleen Garg Date: Fri, 15 May 2026 07:55:28 +0000 Subject: [PATCH 27/28] filter bucket format followup --- ...filter_part2_followups.md => filter_bucket_format_followup.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/{filter_part2_followups.md => filter_bucket_format_followup.md} (100%) diff --git a/docs/filter_part2_followups.md b/docs/filter_bucket_format_followup.md similarity index 100% rename from docs/filter_part2_followups.md rename to docs/filter_bucket_format_followup.md From d1a55222a7124ea23c6201695c0bc4a694e77d97 Mon Sep 17 00:00:00 2001 From: Shaleen Garg Date: Fri, 15 May 2026 08:25:50 +0000 Subject: [PATCH 28/28] filter: split headers into hpp + cpp Move the implementations of CategoryIndex, NumericIndex, Bucket, and Filter from their respective headers into new translation units. The headers now expose only types, declarations, and the tiny inline accessors (sortable_from_float family, Bucket::get_value / is_full / is_empty). Behavior is unchanged; this is a build-time refactor. Define NDD_FILTER_SOURCES once in the root CMakeLists.txt and pull it into both NDD_CORE_SOURCES (for the main binary) and the ndd_filter_test target so the implementations are linked in both places. Add #include <thread> to settings.hpp. It uses std::thread::hardware_concurrency() but was relying on a transitive include from the old filter.hpp; the trimmed filter.hpp no longer pulls in <thread>, so the test build broke without this fix. Verified: ndd_filter_test (42 pass, 7 skip, 0 fail) and ndd_request_validation_test (6 pass, 0 fail) match the pre-split results; ndd-avx2 builds clean. 
--- CMakeLists.txt | 10 + src/filter/category_index.cpp | 267 ++++++++++ src/filter/category_index.hpp | 242 +-------- src/filter/filter.cpp | 806 ++++++++++++++++++++++++++++++ src/filter/filter.hpp | 780 +---------------------------- src/filter/numeric_index.cpp | 905 ++++++++++++++++++++++++++++++++++ src/filter/numeric_index.hpp | 898 ++------------------------------- src/utils/settings.hpp | 1 + tests/CMakeLists.txt | 7 +- 9 files changed, 2063 insertions(+), 1853 deletions(-) create mode 100644 src/filter/category_index.cpp create mode 100644 src/filter/filter.cpp create mode 100644 src/filter/numeric_index.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ae6c10a39..f7a77efecf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -252,10 +252,20 @@ endif() message(STATUS "Binary name: ${NDD_BINARY_NAME}") +# Filter sources are split out so filter headers expose declarations only. +# Tests and the main binary both pull these in via NDD_FILTER_SOURCES. +# Use absolute paths so the list is portable to subdirectories like tests/. +set(NDD_FILTER_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/src/filter/category_index.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/filter/filter.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/filter/numeric_index.cpp +) + # Add new src/*.cpp files here when they should be compiled into ndd. 
set(NDD_CORE_SOURCES src/sparse/inverted_index.cpp src/utils/system_sanity/system_sanity.cpp + ${NDD_FILTER_SOURCES} ) # Build non-main project sources separately so they can be compiled in parallel diff --git a/src/filter/category_index.cpp b/src/filter/category_index.cpp new file mode 100644 index 0000000000..804fe3b4f6 --- /dev/null +++ b/src/filter/category_index.cpp @@ -0,0 +1,267 @@ +#include "category_index.hpp" + +#include +#include + +namespace ndd { + namespace filter { + + std::string CategoryIndex::format_filter_key(const std::string& field, + const std::string& value) { + return field + ":" + value; + } + + ndd::OperationResult + CategoryIndex::read_bitmap_payload(const void* data, size_t len) { + if(data == nullptr || len == 0) { + return {200, "empty bitmap payload"}; + } + + const char* bytes = static_cast(data); + const size_t consumed = + roaring::api::roaring_bitmap_portable_deserialize_size(bytes, len); + if(consumed == 0) { + return {200, "invalid or truncated bitmap payload"}; + } + if(consumed != len) { + return {200, + "bitmap payload length mismatch: consumed " + + std::to_string(consumed) + " of " + + std::to_string(len) + " bytes"}; + } + + ndd::RoaringBitmap bitmap; + try { + bitmap = ndd::RoaringBitmap::readSafe(bytes, len); + } catch(const std::exception& e) { + return {200, + "failed to deserialize bitmap payload: " + std::string(e.what())}; + } + + const char* reason = nullptr; + if(!roaring::api::roaring_bitmap_internal_validate(&bitmap.roaring, &reason)) { + return {200, + std::string("invalid bitmap internals") + + (reason != nullptr ? 
": " + std::string(reason) : "")}; + } + return {SUCCESS, "", std::move(bitmap)}; + } + + ndd::OperationResult + CategoryIndex::get_bitmap_internal(const std::string& filter_key) const { + MDBX_txn* txn = nullptr; + int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn); + if(rc != MDBX_SUCCESS) { + return {100, + "Failed to begin category bitmap read transaction: " + + std::string(mdbx_strerror(rc))}; + } + + MDBX_val key{const_cast(filter_key.c_str()), filter_key.size()}; + MDBX_val data; + + rc = mdbx_get(txn, dbi_, &key, &data); + if(rc == MDBX_NOTFOUND || (rc == MDBX_SUCCESS && data.iov_len == 0)) { + mdbx_txn_abort(txn); + return {SUCCESS, "", ndd::RoaringBitmap()}; + } + if(rc != MDBX_SUCCESS) { + mdbx_txn_abort(txn); + return {100, + "Failed to read category bitmap key '" + filter_key + + "': " + std::string(mdbx_strerror(rc))}; + } + + auto bitmap_result = read_bitmap_payload(data.iov_base, data.iov_len); + if(!bitmap_result.ok()) { + mdbx_txn_abort(txn); + return {bitmap_result.code, + "Corrupt category bitmap payload for key '" + filter_key + + "': " + bitmap_result.message}; + } + if(!bitmap_result.value.has_value()) { + mdbx_txn_abort(txn); + return {200, "Category bitmap reader succeeded without a bitmap for key '" + + filter_key + "'"}; + } + mdbx_txn_abort(txn); + return {SUCCESS, "", std::move(*bitmap_result.value)}; + } + + ndd::OperationResult<> + CategoryIndex::store_bitmap_internal(const std::string& filter_key, + const ndd::RoaringBitmap& bitmap) { + size_t required_size = bitmap.getSizeInBytes(); + if(required_size == 0) { + return {200, "Invalid category bitmap serialization size for key '" + + filter_key + "'"}; + } + + std::vector buffer(required_size); + bitmap.write(buffer.data(), true); + + MDBX_val key{const_cast(filter_key.c_str()), filter_key.size()}; + MDBX_val data{const_cast(buffer.data()), buffer.size()}; + + MDBX_txn* txn = nullptr; + int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); + if(rc != 
MDBX_SUCCESS) { + return {100, + "Failed to begin category bitmap write transaction: " + + std::string(mdbx_strerror(rc))}; + } + + rc = mdbx_put(txn, dbi_, &key, &data, MDBX_UPSERT); + if(rc != MDBX_SUCCESS) { + mdbx_txn_abort(txn); + return {100, "Failed to store category bitmap key '" + filter_key + + "': " + std::string(mdbx_strerror(rc))}; + } + + rc = mdbx_txn_commit(txn); + if(rc != MDBX_SUCCESS) { + return {100, + "Failed to commit category bitmap write transaction: " + + std::string(mdbx_strerror(rc))}; + } + return {SUCCESS, ""}; + } + + CategoryIndex::CategoryIndex(MDBX_env* env) : + env_(env) { + MDBX_txn* txn = nullptr; + int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); + if(rc != MDBX_SUCCESS) { + throw std::runtime_error( + std::string("Failed to begin txn for CategoryIndex init: ") + + mdbx_strerror(rc)); + } + + rc = mdbx_dbi_open(txn, "category_idx", MDBX_CREATE, &dbi_); + if(rc != MDBX_SUCCESS) { + mdbx_txn_abort(txn); + throw std::runtime_error(std::string("Failed to open category_idx dbi: ") + + mdbx_strerror(rc)); + } + + rc = mdbx_txn_commit(txn); + if(rc != MDBX_SUCCESS) { + throw std::runtime_error(std::string("Failed to commit CategoryIndex init: ") + + mdbx_strerror(rc)); + } + } + + ndd::OperationResult> + CategoryIndex::scan_values(const std::string& field) const { + std::vector values; + MDBX_txn* txn = nullptr; + int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn); + if(rc != MDBX_SUCCESS) { + return {100, + "Failed to begin category value scan transaction: " + + std::string(mdbx_strerror(rc))}; + } + + MDBX_cursor* cursor = nullptr; + rc = mdbx_cursor_open(txn, dbi_, &cursor); + if(rc != MDBX_SUCCESS) { + mdbx_txn_abort(txn); + return {100, + "Failed to open category value scan cursor: " + + std::string(mdbx_strerror(rc))}; + } + + std::string prefix = field + ":"; + MDBX_val key{const_cast(prefix.c_str()), prefix.size()}; + MDBX_val data; + + rc = mdbx_cursor_get(cursor, &key, &data, MDBX_SET_RANGE); + 
while(rc == MDBX_SUCCESS) { + std::string found_key(static_cast(key.iov_base), key.iov_len); + if(found_key.rfind(prefix, 0) != 0) { + break; + } + + values.push_back(found_key.substr(prefix.size())); + rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT); + } + + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); + + if(rc != MDBX_SUCCESS && rc != MDBX_NOTFOUND) { + return {100, "Failed during category value scan: " + + std::string(mdbx_strerror(rc))}; + } + return {SUCCESS, "", std::move(values)}; + } + + ndd::OperationResult + CategoryIndex::get_bitmap(const std::string& field, const std::string& value) const { + return get_bitmap_internal(format_filter_key(field, value)); + } + + ndd::OperationResult + CategoryIndex::get_bitmap_by_key(const std::string& key) const { + return get_bitmap_internal(key); + } + + ndd::OperationResult<> + CategoryIndex::add(const std::string& field, const std::string& value, ndd::idInt id) { + std::string filter_key = format_filter_key(field, value); + auto bitmap_result = get_bitmap_internal(filter_key); + if(!bitmap_result.ok()) { + return {bitmap_result.code, bitmap_result.message}; + } + + bitmap_result.value_or_throw().add(id); + return store_bitmap_internal(filter_key, bitmap_result.value_or_throw()); + } + + ndd::OperationResult<> + CategoryIndex::remove(const std::string& field, + const std::string& value, + ndd::idInt id) { + std::string filter_key = format_filter_key(field, value); + auto bitmap_result = get_bitmap_internal(filter_key); + if(!bitmap_result.ok()) { + return {bitmap_result.code, bitmap_result.message}; + } + + bitmap_result.value_or_throw().remove(id); + return store_bitmap_internal(filter_key, bitmap_result.value_or_throw()); + } + + ndd::OperationResult + CategoryIndex::contains(const std::string& field, + const std::string& value, + ndd::idInt id) const { + auto bitmap_result = get_bitmap_internal(format_filter_key(field, value)); + if(!bitmap_result.ok()) { + return {bitmap_result.code, 
bitmap_result.message}; + } + return {SUCCESS, "", bitmap_result.value_or_throw().contains(id)}; + } + + ndd::OperationResult<> + CategoryIndex::add_batch_by_key(const std::string& key, + const std::vector& ids) { + if(ids.empty()) { + return {SUCCESS, ""}; + } + auto bitmap_result = get_bitmap_internal(key); + if(!bitmap_result.ok()) { + return {bitmap_result.code, bitmap_result.message}; + } + + bitmap_result.value_or_throw().addMany(ids.size(), ids.data()); + return store_bitmap_internal(key, bitmap_result.value_or_throw()); + } + + std::string CategoryIndex::make_key(const std::string& field, + const std::string& value) { + return format_filter_key(field, value); + } + + } // namespace filter +} // namespace ndd diff --git a/src/filter/category_index.hpp b/src/filter/category_index.hpp index 1fa8bca606..3d3878ca54 100644 --- a/src/filter/category_index.hpp +++ b/src/filter/category_index.hpp @@ -1,8 +1,6 @@ #pragma once -#include #include -#include #include #include "mdbx/mdbx.h" @@ -18,45 +16,10 @@ namespace ndd { MDBX_dbi dbi_; static std::string format_filter_key(const std::string& field, - const std::string& value) { - return field + ":" + value; - } + const std::string& value); static ndd::OperationResult - read_bitmap_payload(const void* data, size_t len) { - if(data == nullptr || len == 0) { - return {200, "empty bitmap payload"}; - } - - const char* bytes = static_cast(data); - const size_t consumed = - roaring::api::roaring_bitmap_portable_deserialize_size(bytes, len); - if(consumed == 0) { - return {200, "invalid or truncated bitmap payload"}; - } - if(consumed != len) { - return {200, - "bitmap payload length mismatch: consumed " - + std::to_string(consumed) + " of " - + std::to_string(len) + " bytes"}; - } - - ndd::RoaringBitmap bitmap; - try { - bitmap = ndd::RoaringBitmap::readSafe(bytes, len); - } catch(const std::exception& e) { - return {200, - "failed to deserialize bitmap payload: " + std::string(e.what())}; - } - - const char* reason = 
nullptr; - if(!roaring::api::roaring_bitmap_internal_validate(&bitmap.roaring, &reason)) { - return {200, - std::string("invalid bitmap internals") - + (reason != nullptr ? ": " + std::string(reason) : "")}; - } - return {SUCCESS, "", std::move(bitmap)}; - } + read_bitmap_payload(const void* data, size_t len); /* * Loads the bitmap stored for a formatted category filter key. @@ -67,45 +30,7 @@ namespace ndd { * 200 = corrupt stored bitmap payload; caller should log ERROR and return HTTP 500 */ ndd::OperationResult - get_bitmap_internal(const std::string& filter_key) const { - MDBX_txn* txn = nullptr; - int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn); - if(rc != MDBX_SUCCESS) { - return {100, - "Failed to begin category bitmap read transaction: " - + std::string(mdbx_strerror(rc))}; - } - - MDBX_val key{const_cast(filter_key.c_str()), filter_key.size()}; - MDBX_val data; - - rc = mdbx_get(txn, dbi_, &key, &data); - if(rc == MDBX_NOTFOUND || (rc == MDBX_SUCCESS && data.iov_len == 0)) { - mdbx_txn_abort(txn); - return {SUCCESS, "", ndd::RoaringBitmap()}; - } - if(rc != MDBX_SUCCESS) { - mdbx_txn_abort(txn); - return {100, - "Failed to read category bitmap key '" + filter_key - + "': " + std::string(mdbx_strerror(rc))}; - } - - auto bitmap_result = read_bitmap_payload(data.iov_base, data.iov_len); - if(!bitmap_result.ok()) { - mdbx_txn_abort(txn); - return {bitmap_result.code, - "Corrupt category bitmap payload for key '" + filter_key - + "': " + bitmap_result.message}; - } - if(!bitmap_result.value.has_value()) { - mdbx_txn_abort(txn); - return {200, "Category bitmap reader succeeded without a bitmap for key '" - + filter_key + "'"}; - } - mdbx_txn_abort(txn); - return {SUCCESS, "", std::move(*bitmap_result.value)}; - } + get_bitmap_internal(const std::string& filter_key) const; /* * Stores the bitmap for a formatted category filter key. 
@@ -116,68 +41,10 @@ namespace ndd { * 200 = invalid bitmap serialization; caller should log ERROR and return HTTP 500 */ ndd::OperationResult<> store_bitmap_internal(const std::string& filter_key, - const ndd::RoaringBitmap& bitmap) { - size_t required_size = bitmap.getSizeInBytes(); - if(required_size == 0) { - return {200, "Invalid category bitmap serialization size for key '" - + filter_key + "'"}; - } - - std::vector buffer(required_size); - bitmap.write(buffer.data(), true); - - MDBX_val key{const_cast(filter_key.c_str()), filter_key.size()}; - MDBX_val data{const_cast(buffer.data()), buffer.size()}; - - MDBX_txn* txn = nullptr; - int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); - if(rc != MDBX_SUCCESS) { - return {100, - "Failed to begin category bitmap write transaction: " - + std::string(mdbx_strerror(rc))}; - } - - rc = mdbx_put(txn, dbi_, &key, &data, MDBX_UPSERT); - if(rc != MDBX_SUCCESS) { - mdbx_txn_abort(txn); - return {100, "Failed to store category bitmap key '" + filter_key - + "': " + std::string(mdbx_strerror(rc))}; - } - - rc = mdbx_txn_commit(txn); - if(rc != MDBX_SUCCESS) { - return {100, - "Failed to commit category bitmap write transaction: " - + std::string(mdbx_strerror(rc))}; - } - return {SUCCESS, ""}; - } + const ndd::RoaringBitmap& bitmap); public: - CategoryIndex(MDBX_env* env) : - env_(env) { - MDBX_txn* txn = nullptr; - int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); - if(rc != MDBX_SUCCESS) { - throw std::runtime_error( - std::string("Failed to begin txn for CategoryIndex init: ") - + mdbx_strerror(rc)); - } - - // Open named DB for category/boolean - rc = mdbx_dbi_open(txn, "category_idx", MDBX_CREATE, &dbi_); - if(rc != MDBX_SUCCESS) { - mdbx_txn_abort(txn); - throw std::runtime_error(std::string("Failed to open category_idx dbi: ") - + mdbx_strerror(rc)); - } - - rc = mdbx_txn_commit(txn); - if(rc != MDBX_SUCCESS) { - throw std::runtime_error(std::string("Failed to commit CategoryIndex init: 
") - + mdbx_strerror(rc)); - } - } + CategoryIndex(MDBX_env* env); /* * Lists all unique category values stored for one field. @@ -187,49 +54,7 @@ namespace ndd { * 100 = MDBX transaction, cursor, or scan failure; caller should log ERROR and return HTTP 500 */ ndd::OperationResult> - scan_values(const std::string& field) const { - std::vector values; - MDBX_txn* txn = nullptr; - int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn); - if(rc != MDBX_SUCCESS) { - return {100, - "Failed to begin category value scan transaction: " - + std::string(mdbx_strerror(rc))}; - } - - MDBX_cursor* cursor = nullptr; - rc = mdbx_cursor_open(txn, dbi_, &cursor); - if(rc != MDBX_SUCCESS) { - mdbx_txn_abort(txn); - return {100, - "Failed to open category value scan cursor: " - + std::string(mdbx_strerror(rc))}; - } - - std::string prefix = field + ":"; - MDBX_val key{const_cast(prefix.c_str()), prefix.size()}; - MDBX_val data; - - rc = mdbx_cursor_get(cursor, &key, &data, MDBX_SET_RANGE); - while(rc == MDBX_SUCCESS) { - std::string found_key(static_cast(key.iov_base), key.iov_len); - if(found_key.rfind(prefix, 0) != 0) { - break; - } - - values.push_back(found_key.substr(prefix.size())); - rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT); - } - - mdbx_cursor_close(cursor); - mdbx_txn_abort(txn); - - if(rc != MDBX_SUCCESS && rc != MDBX_NOTFOUND) { - return {100, "Failed during category value scan: " - + std::string(mdbx_strerror(rc))}; - } - return {SUCCESS, "", std::move(values)}; - } + scan_values(const std::string& field) const; /* * Loads the bitmap for one category field/value pair. 
@@ -240,9 +65,7 @@ namespace ndd { * 200-299 = propagated corruption/invariant failure from the bitmap read helper */ ndd::OperationResult - get_bitmap(const std::string& field, const std::string& value) const { - return get_bitmap_internal(format_filter_key(field, value)); - } + get_bitmap(const std::string& field, const std::string& value) const; /* * Loads the bitmap for an already formatted category key. @@ -253,9 +76,7 @@ namespace ndd { * 200-299 = propagated corruption/invariant failure from the bitmap read helper */ ndd::OperationResult - get_bitmap_by_key(const std::string& key) const { - return get_bitmap_internal(key); - } + get_bitmap_by_key(const std::string& key) const; /* * Adds one id to a category field/value bitmap. @@ -266,16 +87,7 @@ namespace ndd { * 200-299 = propagated corruption/invariant failure from bitmap read/write helpers */ ndd::OperationResult<> - add(const std::string& field, const std::string& value, ndd::idInt id) { - std::string filter_key = format_filter_key(field, value); - auto bitmap_result = get_bitmap_internal(filter_key); - if(!bitmap_result.ok()) { - return {bitmap_result.code, bitmap_result.message}; - } - - bitmap_result.value_or_throw().add(id); - return store_bitmap_internal(filter_key, bitmap_result.value_or_throw()); - } + add(const std::string& field, const std::string& value, ndd::idInt id); /* * Removes one id from a category field/value bitmap. 
@@ -286,16 +98,7 @@ namespace ndd { * 200-299 = propagated corruption/invariant failure from bitmap read/write helpers */ ndd::OperationResult<> - remove(const std::string& field, const std::string& value, ndd::idInt id) { - std::string filter_key = format_filter_key(field, value); - auto bitmap_result = get_bitmap_internal(filter_key); - if(!bitmap_result.ok()) { - return {bitmap_result.code, bitmap_result.message}; - } - - bitmap_result.value_or_throw().remove(id); - return store_bitmap_internal(filter_key, bitmap_result.value_or_throw()); - } + remove(const std::string& field, const std::string& value, ndd::idInt id); /* * Checks whether one id is present in a category field/value bitmap. @@ -306,13 +109,7 @@ namespace ndd { * 200-299 = propagated corruption/invariant failure from the bitmap read helper */ ndd::OperationResult - contains(const std::string& field, const std::string& value, ndd::idInt id) const { - auto bitmap_result = get_bitmap_internal(format_filter_key(field, value)); - if(!bitmap_result.ok()) { - return {bitmap_result.code, bitmap_result.message}; - } - return {SUCCESS, "", bitmap_result.value_or_throw().contains(id)}; - } + contains(const std::string& field, const std::string& value, ndd::idInt id) const; /* * Adds a batch of ids to an already formatted category key. 
@@ -323,23 +120,10 @@ namespace ndd { * 200-299 = propagated corruption/invariant failure from bitmap read/write helpers */ ndd::OperationResult<> - add_batch_by_key(const std::string& key, const std::vector& ids) { - if(ids.empty()) { - return {SUCCESS, ""}; - } - auto bitmap_result = get_bitmap_internal(key); - if(!bitmap_result.ok()) { - return {bitmap_result.code, bitmap_result.message}; - } - - bitmap_result.value_or_throw().addMany(ids.size(), ids.data()); - return store_bitmap_internal(key, bitmap_result.value_or_throw()); - } + add_batch_by_key(const std::string& key, const std::vector& ids); // Expose key formatting for external batching logic - static std::string make_key(const std::string& field, const std::string& value) { - return format_filter_key(field, value); - } + static std::string make_key(const std::string& field, const std::string& value); MDBX_dbi get_dbi() const { return dbi_; } }; diff --git a/src/filter/filter.cpp b/src/filter/filter.cpp new file mode 100644 index 0000000000..dbc751f343 --- /dev/null +++ b/src/filter/filter.cpp @@ -0,0 +1,806 @@ +#include "filter.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include "../utils/log.hpp" +#include "../utils/settings.hpp" + +ndd::OperationResult<> Filter::load_schema() { + MDBX_txn* txn = nullptr; + int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn); + if(rc != MDBX_SUCCESS) { + return {100, "Failed to begin schema read transaction: " + + std::string(mdbx_strerror(rc))}; + } + + MDBX_val key{const_cast(SCHEMA_KEY), std::strlen(SCHEMA_KEY)}; + MDBX_val data; + rc = mdbx_get(txn, dbi_, &key, &data); + + if(rc == MDBX_NOTFOUND || (rc == MDBX_SUCCESS && data.iov_len == 0)) { + mdbx_txn_abort(txn); + return {SUCCESS, ""}; + } + if(rc != MDBX_SUCCESS) { + mdbx_txn_abort(txn); + return {100, "Failed to read filter schema: " + std::string(mdbx_strerror(rc))}; + } + + try { + std::string json_str(static_cast(data.iov_base), data.iov_len); + auto parsed = 
nlohmann::json::parse(json_str); + std::lock_guard lock(schema_mutex_); + schema_cache_.clear(); + for(auto& [field, stored_type] : parsed.items()) { + schema_cache_[field] = static_cast(stored_type.get()); + } + } catch(const std::exception& e) { + mdbx_txn_abort(txn); + return {200, "Failed to parse filter schema: " + std::string(e.what())}; + } + + mdbx_txn_abort(txn); + return {SUCCESS, ""}; +} + +ndd::OperationResult<> Filter::save_schema_internal() { + nlohmann::json schema_json; + for(const auto& [field, type] : schema_cache_) { + schema_json[field] = static_cast(type); + } + std::string json_str = schema_json.dump(); + + MDBX_txn* txn = nullptr; + int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); + if(rc != MDBX_SUCCESS) { + return {100, "Failed to begin schema write transaction: " + + std::string(mdbx_strerror(rc))}; + } + + MDBX_val key{const_cast(SCHEMA_KEY), std::strlen(SCHEMA_KEY)}; + MDBX_val data{const_cast(json_str.c_str()), json_str.size()}; + + rc = mdbx_put(txn, dbi_, &key, &data, MDBX_UPSERT); + if(rc != MDBX_SUCCESS) { + mdbx_txn_abort(txn); + return {100, "Failed to persist filter schema: " + + std::string(mdbx_strerror(rc))}; + } + + rc = mdbx_txn_commit(txn); + if(rc != MDBX_SUCCESS) { + return {100, "Failed to commit filter schema update: " + + std::string(mdbx_strerror(rc))}; + } + return {SUCCESS, ""}; +} + +ndd::OperationResult<> Filter::register_field_type(const std::string& field, FieldType type) { + std::lock_guard lock(schema_mutex_); + auto it = schema_cache_.find(field); + if(it != schema_cache_.end()) { + if(it->second == type) { + return {SUCCESS, ""}; + } + return {3, "Filter field '" + field + "' has a different existing type"}; + } + + schema_cache_[field] = type; + auto save_result = save_schema_internal(); + if(!save_result.ok()) { + schema_cache_.erase(field); + return save_result; + } + return {SUCCESS, ""}; +} + +ndd::OperationResult Filter::sortable_from_json(const nlohmann::json& value, + const 
std::string& context) { + if(value.is_number_integer()) { + return {SUCCESS, "", ndd::filter::int_to_sortable(value.get())}; + } + if(value.is_number()) { + return {SUCCESS, "", ndd::filter::float_to_sortable(value.get())}; + } + return {2, context + " must be a number"}; +} + +ndd::OperationResult +Filter::category_value_from_json(const nlohmann::json& value, const std::string& context) { + std::string str_val; + if(value.is_string()) { + str_val = value.get(); + } else if(value.is_boolean()) { + str_val = value.get() ? "1" : "0"; + } else if(value.is_number_integer()) { + str_val = std::to_string(value.get()); + } else { + return {2, context + " must be string, integer, or boolean"}; + } + + if(str_val.size() > 255) { + return {2, context + " is too long"}; + } + auto delim_check = validate_filter_key_component(str_val, context); + if(!delim_check.ok()) { + return {delim_check.code, delim_check.message}; + } + return {SUCCESS, "", std::move(str_val)}; +} + +ndd::OperationResult<> +Filter::validate_filter_key_component(const std::string& component, + const std::string& context) { + if(component.find(':') != std::string::npos) { + return {1, context + " must not contain ':'"}; + } + return {SUCCESS, ""}; +} + +std::string Filter::format_filter_key(const std::string& field, const std::string& value) { + return field + ":" + value; +} + +ndd::OperationResult> +Filter::numeric_bound_from_comparison(const std::string& op, const nlohmann::json& val) { + using Bound = std::pair; + constexpr uint32_t SORTABLE_MIN = 0x00000000u; + constexpr uint32_t SORTABLE_MAX = 0xFFFFFFFFu; + const Bound EMPTY{SORTABLE_MAX, SORTABLE_MIN}; + + if(!val.is_number()) { + return {2, op + " value must be a finite number"}; + } + if(!val.is_number_integer() && !std::isfinite(val.get())) { + return {2, op + " value must be a finite number"}; + } + + if(op == "$gte") { + auto sortable_result = sortable_from_json(val, op + " value"); + if(!sortable_result.ok()) { + return {sortable_result.code, 
sortable_result.message}; + } + return {SUCCESS, "", Bound{sortable_result.value_or_throw(), SORTABLE_MAX}}; + } + if(op == "$lte") { + auto sortable_result = sortable_from_json(val, op + " value"); + if(!sortable_result.ok()) { + return {sortable_result.code, sortable_result.message}; + } + return {SUCCESS, "", Bound{SORTABLE_MIN, sortable_result.value_or_throw()}}; + } + if(op == "$gt") { + if(val.is_number_integer()) { + int32_t x = val.get(); + if(x == std::numeric_limits::max()) { + return {SUCCESS, "", EMPTY}; + } + return {SUCCESS, "", Bound{ndd::filter::int_to_sortable(x + 1), SORTABLE_MAX}}; + } + float x = val.get(); + float next = std::nextafterf(x, std::numeric_limits::infinity()); + if(!std::isfinite(next)) { + return {SUCCESS, "", EMPTY}; + } + return {SUCCESS, "", Bound{ndd::filter::float_to_sortable(next), SORTABLE_MAX}}; + } + if(op == "$lt") { + if(val.is_number_integer()) { + int32_t x = val.get(); + if(x == std::numeric_limits::min()) { + return {SUCCESS, "", EMPTY}; + } + return {SUCCESS, "", Bound{SORTABLE_MIN, ndd::filter::int_to_sortable(x - 1)}}; + } + float x = val.get(); + float next = std::nextafterf(x, -std::numeric_limits::infinity()); + if(!std::isfinite(next)) { + return {SUCCESS, "", EMPTY}; + } + return {SUCCESS, "", Bound{SORTABLE_MIN, ndd::filter::float_to_sortable(next)}}; + } + + return {2, "Unsupported numeric comparison operator: " + op}; +} + +void Filter::init_environment() { + int rc = mdbx_env_create(&env_); + if(rc != MDBX_SUCCESS) { + throw std::runtime_error(std::string("Failed to create LMDB env for filters: ") + + mdbx_strerror(rc)); + } + + // max DBs to allow multiple databases (main + schema + numeric_forward + numeric_inverted) + rc = mdbx_env_set_maxdbs(env_, 10); + if(rc != MDBX_SUCCESS) { + throw std::runtime_error(std::string("Failed to configure max DBs for filters: ") + + mdbx_strerror(rc)); + } + + // Set geometry for auto-grow using the filter map size settings + rc = mdbx_env_set_geometry(env_, + -1, + 
1ULL << settings::FILTER_MAP_SIZE_BITS, + 1ULL << settings::FILTER_MAP_SIZE_MAX_BITS, + 1ULL << settings::FILTER_MAP_SIZE_BITS, + -1, + -1); + if(rc != MDBX_SUCCESS) { + throw std::runtime_error(std::string("Failed to set geometry for filters: ") + + mdbx_strerror(rc)); + } + + rc = mdbx_env_open(env_, + path_.c_str(), + MDBX_WRITEMAP | MDBX_MAPASYNC | MDBX_NORDAHEAD, + 0664); + if(rc != MDBX_SUCCESS) { + throw std::runtime_error(std::string("Failed to open filter environment: ") + + mdbx_strerror(rc)); + } + + MDBX_txn* txn = nullptr; + rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); + if(rc != MDBX_SUCCESS) { + throw std::runtime_error(std::string("Failed to begin filter transaction: ") + + mdbx_strerror(rc)); + } + + rc = mdbx_dbi_open(txn, nullptr, MDBX_CREATE, &dbi_); + if(rc != MDBX_SUCCESS) { + mdbx_txn_abort(txn); + throw std::runtime_error(std::string("Failed to open filter database: ") + + mdbx_strerror(rc)); + } + + rc = mdbx_txn_commit(txn); + if(rc != MDBX_SUCCESS) { + throw std::runtime_error(std::string("Failed to commit filter transaction: ") + + mdbx_strerror(rc)); + } + + // Initialize Indices + numeric_index_ = std::make_unique(env_); + category_index_ = std::make_unique(env_); + + auto schema_result = load_schema(); + if(!schema_result.ok()) { + LOG_ERROR(1201, index_id_, schema_result.message); + throw std::runtime_error(schema_result.message); + } +} + +Filter::Filter(const std::string& path, const std::string& index_id) : + index_id_(index_id), + path_(path) { + std::filesystem::create_directories(path); + init_environment(); +} + +Filter::Filter(const std::string& path) : + Filter(path, "-/-") {} + +Filter::~Filter() { + mdbx_dbi_close(env_, dbi_); + mdbx_env_close(env_); +} + +ndd::OperationResult +Filter::computeFilterBitmap(const nlohmann::json& filter_array) const { + if(!filter_array.is_array()) { + return {1, "Filter must be an array"}; + } + + if(filter_array.empty()) { + return {SUCCESS, "", ndd::RoaringBitmap()}; + } + 
+ std::vector partial_results; + partial_results.reserve(filter_array.size()); + + for(const auto& condition : filter_array) { + if(!condition.is_object() || condition.size() != 1) { + return {1, "Each filter condition must be a single-field object"}; + } + + const auto& field = condition.begin().key(); + const auto& expr = condition.begin().value(); + if(field.empty()) { + return {1, "Filter field name cannot be empty"}; + } + auto field_check = validate_filter_key_component(field, "Filter field name"); + if(!field_check.ok()) { + return {field_check.code, field_check.message}; + } + if(!expr.is_object() || expr.size() != 1) { + return {1, "Filter operator must be a single-field object"}; + } + + // Check schema for field type + FieldType type = FieldType::Unknown; + { + std::lock_guard lock(schema_mutex_); + auto it = schema_cache_.find(field); + if(it != schema_cache_.end()) { + type = it->second; + } + } + + const std::string op = expr.begin().key(); + const auto& val = expr.begin().value(); + ndd::RoaringBitmap or_result; + + if(op == "$eq") { + if(type == FieldType::Number) { + auto sortable_result = sortable_from_json(val, "$eq value for numeric field"); + if(!sortable_result.ok()) { + return {sortable_result.code, sortable_result.message}; + } + auto range_result = + numeric_index_->range(field, sortable_result.value_or_throw(), sortable_result.value_or_throw()); + if(!range_result.ok()) { + return {range_result.code, range_result.message}; + } + or_result = std::move(range_result.value_or_throw()); + } else { + auto value_result = category_value_from_json(val, "$eq value"); + if(!value_result.ok()) { + return {value_result.code, value_result.message}; + } + auto bitmap_result = category_index_->get_bitmap_by_key( + format_filter_key(field, value_result.value_or_throw())); + if(!bitmap_result.ok()) { + return {bitmap_result.code, bitmap_result.message}; + } + or_result = std::move(bitmap_result.value_or_throw()); + } + } else if(op == "$in") { + 
if(!val.is_array()) { + return {2, "$in must be an array"}; + } + + for(const auto& item : val) { + if(type == FieldType::Number) { + auto sortable_result = + sortable_from_json(item, "$in value for numeric field"); + if(!sortable_result.ok()) { + return {sortable_result.code, sortable_result.message}; + } + auto range_result = numeric_index_->range(field, + sortable_result.value_or_throw(), + sortable_result.value_or_throw()); + if(!range_result.ok()) { + return {range_result.code, range_result.message}; + } + or_result |= range_result.value_or_throw(); + } else { + auto value_result = category_value_from_json(item, "$in value"); + if(!value_result.ok()) { + return {value_result.code, value_result.message}; + } + if(!value_result.value_or_throw().empty()) { + auto bitmap_result = category_index_->get_bitmap_by_key( + format_filter_key(field, value_result.value_or_throw())); + if(!bitmap_result.ok()) { + return {bitmap_result.code, bitmap_result.message}; + } + or_result |= bitmap_result.value_or_throw(); + } + } + } + } else if(op == "$range") { + if(!val.is_array() || val.size() != 2) { + return {2, "$range must be [start, end] with exactly 2 values"}; + } + if(type != FieldType::Number) { + return {2, "$range operator is only supported for numeric fields"}; + } + + auto start_result = sortable_from_json(val[0], "Range start"); + if(!start_result.ok()) { + return {start_result.code, start_result.message}; + } + auto end_result = sortable_from_json(val[1], "Range end"); + if(!end_result.ok()) { + return {end_result.code, end_result.message}; + } + if(start_result.value_or_throw() > end_result.value_or_throw()) { + return {2, "Invalid range: start > end"}; + } + + auto range_result = + numeric_index_->range(field, start_result.value_or_throw(), end_result.value_or_throw()); + if(!range_result.ok()) { + return {range_result.code, range_result.message}; + } + or_result = std::move(range_result.value_or_throw()); + } else if(op == "$lt" || op == "$lte" || op == "$gt" 
|| op == "$gte") { + if(type != FieldType::Number) { + return {2, op + " operator is only supported for numeric fields"}; + } + auto bound_result = numeric_bound_from_comparison(op, val); + if(!bound_result.ok()) { + return {bound_result.code, bound_result.message}; + } + auto [min_val, max_val] = bound_result.value_or_throw(); + if(min_val <= max_val) { + auto range_result = numeric_index_->range(field, min_val, max_val); + if(!range_result.ok()) { + return {range_result.code, range_result.message}; + } + or_result = std::move(range_result.value_or_throw()); + } + } else { + return {2, "Unsupported filter operator: " + op}; + } + + partial_results.push_back(std::move(or_result)); + } + + // Optimization: Sort by cardinality (smallest first) + std::sort(partial_results.begin(), + partial_results.end(), + [](const ndd::RoaringBitmap& left, const ndd::RoaringBitmap& right) { + return left.cardinality() < right.cardinality(); + }); + + if(partial_results.empty()) { + return {SUCCESS, "", ndd::RoaringBitmap()}; + } + + ndd::RoaringBitmap final_result = partial_results[0]; + for(size_t i = 1; i < partial_results.size(); ++i) { + final_result &= partial_results[i]; + + // If result becomes empty, stop early + if(final_result.isEmpty()) { + return {SUCCESS, "", std::move(final_result)}; + } + } + + return {SUCCESS, "", std::move(final_result)}; +} + +ndd::OperationResult> +Filter::getIdsMatchingFilter(const nlohmann::json& filter_array) const { + auto bitmap_result = computeFilterBitmap(filter_array); + if(!bitmap_result.ok()) { + return {bitmap_result.code, bitmap_result.message}; + } + + std::vector ids; + ids.reserve(bitmap_result.value_or_throw().cardinality()); + bitmap_result.value_or_throw().iterate( + [](ndd::idInt val, void* ptr) { + static_cast*>(ptr)->push_back(val); + return true; + }, + &ids); + return {SUCCESS, "", std::move(ids)}; +} + +ndd::OperationResult +Filter::countIdsMatchingFilter(const nlohmann::json& filter_array) const { + auto bitmap_result = 
computeFilterBitmap(filter_array); + if(!bitmap_result.ok()) { + return {bitmap_result.code, bitmap_result.message}; + } + return {SUCCESS, "", bitmap_result.value_or_throw().cardinality()}; +} + +ndd::OperationResult<> +Filter::add_to_filter(const std::string& field, const std::string& value, ndd::idInt numeric_id) { + auto field_check = validate_filter_key_component(field, "Filter field name"); + if(!field_check.ok()) { + return field_check; + } + auto value_check = validate_filter_key_component(value, "Filter value"); + if(!value_check.ok()) { + return value_check; + } + return category_index_->add(field, value, numeric_id); +} + +ndd::OperationResult<> +Filter::add_to_filter_batch(const std::string& filter_key, + const std::vector& numeric_ids) { + if(numeric_ids.empty()) { + return {SUCCESS, ""}; + } + return category_index_->add_batch_by_key(filter_key, numeric_ids); +} + +ndd::OperationResult<> Filter::add_filters_from_json_batch( + const std::vector>& id_filter_pairs) { + if(id_filter_pairs.empty()) { + return {SUCCESS, ""}; + } + + // Create a map to collect IDs for each label filter + std::unordered_map> label_filter_to_ids; + label_filter_to_ids.reserve(id_filter_pairs.size()); + std::vector numeric_filter_entries; + numeric_filter_entries.reserve(id_filter_pairs.size()); + + // Group IDs by filter + for(const auto& [numeric_id, filter_json] : id_filter_pairs) { + nlohmann::json parsed; + try { + parsed = nlohmann::json::parse(filter_json); + } catch(const std::exception& e) { + return {1, "Invalid filter JSON: " + std::string(e.what())}; + } + + if(!parsed.is_object()) { + return {1, "Filter JSON document must be an object"}; + } + + for(const auto& [field, value] : parsed.items()) { + if(field.empty()) { + return {1, "Filter field name cannot be empty"}; + } + auto field_check = validate_filter_key_component(field, "Filter field name"); + if(!field_check.ok()) { + return {field_check.code, field_check.message}; + } + + FieldType type = 
FieldType::Unknown; + if(value.is_boolean()) { + type = FieldType::Bool; + } else if(value.is_number()) { + type = FieldType::Number; + } else if(value.is_string()) { + type = FieldType::String; + } + + if(type == FieldType::Unknown) { + return {2, "Unsupported filter type for field '" + field + "'"}; + } + + auto register_result = register_field_type(field, type); + if(!register_result.ok()) { + return register_result; + } + + if(type == FieldType::String) { + auto category_result = category_value_from_json(value, "Filter value"); + if(!category_result.ok()) { + return {category_result.code, + category_result.message + " for field '" + field + "'"}; + } + label_filter_to_ids[format_filter_key(field, category_result.value_or_throw())] + .emplace_back(numeric_id); + } else if(type == FieldType::Bool) { + label_filter_to_ids[format_filter_key(field, value.get() ? "1" : "0")] + .emplace_back(numeric_id); + } else if(type == FieldType::Number) { + auto sortable_result = sortable_from_json(value, "Numeric filter value"); + if(!sortable_result.ok()) { + return {sortable_result.code, + sortable_result.message + " for field '" + field + "'"}; + } + numeric_filter_entries.emplace_back(field, numeric_id, sortable_result.value_or_throw()); + } + } + } + + /** + * XXX: For transactional correctness of filter adds, all the filters + * should be added in a single transaction. + * For now, they are being added in two different transactions. + * one for numeric_index and other for labels. 
+ */ + + if(!numeric_filter_entries.empty()) { + auto numeric_result = numeric_index_->put_batch(numeric_filter_entries); + if(!numeric_result.ok()) { + return numeric_result; + } + } + + // Process each filter with its batch of IDs + for(const auto& [filter_key, ids] : label_filter_to_ids) { + auto add_result = add_to_filter_batch(filter_key, ids); + if(!add_result.ok()) { + return add_result; + } + } + + return {SUCCESS, ""}; +} + +ndd::OperationResult<> +Filter::remove_from_filter(const std::string& field, + const std::string& value, + ndd::idInt numeric_id) { + auto field_check = validate_filter_key_component(field, "Filter field name"); + if(!field_check.ok()) { + return field_check; + } + auto value_check = validate_filter_key_component(value, "Filter value"); + if(!value_check.ok()) { + return value_check; + } + return category_index_->remove(field, value, numeric_id); +} + +ndd::OperationResult +Filter::contains(const std::string& field, + const std::string& value, + ndd::idInt numeric_id) const { + auto field_check = validate_filter_key_component(field, "Filter field name"); + if(!field_check.ok()) { + return {field_check.code, field_check.message}; + } + auto value_check = validate_filter_key_component(value, "Filter value"); + if(!value_check.ok()) { + return {value_check.code, value_check.message}; + } + return category_index_->contains(field, value, numeric_id); +} + +ndd::OperationResult<> Filter::add_filters_from_json(ndd::idInt numeric_id, + const std::string& filter_json) { + return add_filters_from_json_batch({{numeric_id, filter_json}}); +} + +ndd::OperationResult<> Filter::remove_filters_from_json(ndd::idInt numeric_id, + const std::string& filter_json) { + nlohmann::json parsed; + try { + parsed = nlohmann::json::parse(filter_json); + } catch(const std::exception& e) { + return {1, "Invalid filter JSON while removing filters: " + std::string(e.what())}; + } + + if(!parsed.is_object()) { + return {1, "Filter JSON document must be an object"}; + 
} + + for(const auto& [field, value] : parsed.items()) { + if(field.empty()) { + return {1, "Filter field name cannot be empty"}; + } + auto field_check = validate_filter_key_component(field, "Filter field name"); + if(!field_check.ok()) { + return {field_check.code, field_check.message}; + } + + ndd::OperationResult<> remove_result{SUCCESS, ""}; + if(value.is_string()) { + auto category_result = category_value_from_json(value, "Filter value"); + if(!category_result.ok()) { + return {category_result.code, + category_result.message + " for field '" + field + "'"}; + } + remove_result = remove_from_filter(field, category_result.value_or_throw(), numeric_id); + } else if(value.is_number()) { + // Remove from Numeric Index + remove_result = numeric_index_->remove(field, numeric_id); + } else if(value.is_boolean()) { + remove_result = remove_from_filter(field, + value.get() ? "1" : "0", + numeric_id); + } else { + return {2, "Unsupported filter type for field '" + field + "'"}; + } + + if(!remove_result.ok()) { + return remove_result; + } + } + + return {SUCCESS, ""}; +} + +ndd::OperationResult Filter::combine_filters_and( + const std::vector>& filters) const { + ndd::RoaringBitmap result; + bool first = true; + for(const auto& [field, value] : filters) { + auto bitmap_result = category_index_->get_bitmap(field, value); + if(!bitmap_result.ok()) { + return {bitmap_result.code, bitmap_result.message}; + } + if(first) { + result = std::move(bitmap_result.value_or_throw()); + first = false; + } else { + result &= bitmap_result.value_or_throw(); + } + } + return {SUCCESS, "", std::move(result)}; +} + +ndd::OperationResult Filter::combine_filters_or( + const std::vector>& filters) const { + ndd::RoaringBitmap result; + for(const auto& [field, value] : filters) { + auto bitmap_result = category_index_->get_bitmap(field, value); + if(!bitmap_result.ok()) { + return {bitmap_result.code, bitmap_result.message}; + } + result |= bitmap_result.value_or_throw(); + } + return 
{SUCCESS, "", std::move(result)}; +} + +ndd::OperationResult Filter::check_numeric(const std::string& field, + ndd::idInt id, + const std::string& op, + const nlohmann::json& val) const { + if(op == "$eq") { + auto sortable_result = sortable_from_json(val, "$eq value for numeric field"); + if(!sortable_result.ok()) { + return {sortable_result.code, sortable_result.message}; + } + return numeric_index_->check_range(field, + id, + sortable_result.value_or_throw(), + sortable_result.value_or_throw()); + } + + if(op == "$in") { + if(!val.is_array()) { + return {2, "$in must be an array"}; + } + for(const auto& item : val) { + auto sortable_result = sortable_from_json(item, "$in value for numeric field"); + if(!sortable_result.ok()) { + return {sortable_result.code, sortable_result.message}; + } + + auto check_result = numeric_index_->check_range(field, + id, + sortable_result.value_or_throw(), + sortable_result.value_or_throw()); + if(!check_result.ok()) { + return check_result; + } + if(check_result.value_or_throw()) { + return {SUCCESS, "", true}; + } + } + return {SUCCESS, "", false}; + } + + if(op == "$range") { + if(!val.is_array() || val.size() != 2) { + return {2, "$range must be [start, end] with exactly 2 values"}; + } + + auto start_result = sortable_from_json(val[0], "Range start"); + if(!start_result.ok()) { + return {start_result.code, start_result.message}; + } + auto end_result = sortable_from_json(val[1], "Range end"); + if(!end_result.ok()) { + return {end_result.code, end_result.message}; + } + if(start_result.value_or_throw() > end_result.value_or_throw()) { + return {2, "Invalid range: start > end"}; + } + + return numeric_index_->check_range(field, id, start_result.value_or_throw(), end_result.value_or_throw()); + } + + if(op == "$lt" || op == "$lte" || op == "$gt" || op == "$gte") { + auto bound_result = numeric_bound_from_comparison(op, val); + if(!bound_result.ok()) { + return {bound_result.code, bound_result.message}; + } + auto [min_val, 
max_val] = bound_result.value_or_throw(); + if(min_val > max_val) { + return {SUCCESS, "", false}; + } + return numeric_index_->check_range(field, id, min_val, max_val); + } + + return {2, "Unsupported numeric operator: " + op}; +} diff --git a/src/filter/filter.hpp b/src/filter/filter.hpp index 74a7d95761..0038f1b64d 100644 --- a/src/filter/filter.hpp +++ b/src/filter/filter.hpp @@ -1,15 +1,8 @@ #pragma once -// System includes -#include -#include #include -#include -#include -#include #include #include -#include #include #include #include @@ -19,8 +12,6 @@ #include "mdbx/mdbx.h" #include "../core/types.hpp" #include "../hnsw/hnswlib.h" -#include "../utils/log.hpp" -#include "../utils/settings.hpp" #include "../utils/types.hpp" #include "category_index.hpp" @@ -66,43 +57,7 @@ class Filter { * 100 = MDBX transaction or read failure; caller should log ERROR and return HTTP 500 * 200 = corrupt schema JSON payload; caller should log ERROR and return HTTP 500 */ - ndd::OperationResult<> load_schema() { - MDBX_txn* txn = nullptr; - int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn); - if(rc != MDBX_SUCCESS) { - return {100, "Failed to begin schema read transaction: " - + std::string(mdbx_strerror(rc))}; - } - - MDBX_val key{const_cast(SCHEMA_KEY), std::strlen(SCHEMA_KEY)}; - MDBX_val data; - rc = mdbx_get(txn, dbi_, &key, &data); - - if(rc == MDBX_NOTFOUND || (rc == MDBX_SUCCESS && data.iov_len == 0)) { - mdbx_txn_abort(txn); - return {SUCCESS, ""}; - } - if(rc != MDBX_SUCCESS) { - mdbx_txn_abort(txn); - return {100, "Failed to read filter schema: " + std::string(mdbx_strerror(rc))}; - } - - try { - std::string json_str(static_cast(data.iov_base), data.iov_len); - auto parsed = nlohmann::json::parse(json_str); - std::lock_guard lock(schema_mutex_); - schema_cache_.clear(); - for(auto& [field, stored_type] : parsed.items()) { - schema_cache_[field] = static_cast(stored_type.get()); - } - } catch(const std::exception& e) { - mdbx_txn_abort(txn); - return {200, 
"Failed to parse filter schema: " + std::string(e.what())}; - } - - mdbx_txn_abort(txn); - return {SUCCESS, ""}; - } + ndd::OperationResult<> load_schema(); /* * Persists the current in-memory filter schema cache. @@ -111,37 +66,7 @@ class Filter { * 0 = success * 100 = MDBX transaction, write, or commit failure; caller should log ERROR and return HTTP 500 */ - ndd::OperationResult<> save_schema_internal() { - nlohmann::json schema_json; - for(const auto& [field, type] : schema_cache_) { - schema_json[field] = static_cast(type); - } - std::string json_str = schema_json.dump(); - - MDBX_txn* txn = nullptr; - int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); - if(rc != MDBX_SUCCESS) { - return {100, "Failed to begin schema write transaction: " - + std::string(mdbx_strerror(rc))}; - } - - MDBX_val key{const_cast(SCHEMA_KEY), std::strlen(SCHEMA_KEY)}; - MDBX_val data{const_cast(json_str.c_str()), json_str.size()}; - - rc = mdbx_put(txn, dbi_, &key, &data, MDBX_UPSERT); - if(rc != MDBX_SUCCESS) { - mdbx_txn_abort(txn); - return {100, "Failed to persist filter schema: " - + std::string(mdbx_strerror(rc))}; - } - - rc = mdbx_txn_commit(txn); - if(rc != MDBX_SUCCESS) { - return {100, "Failed to commit filter schema update: " - + std::string(mdbx_strerror(rc))}; - } - return {SUCCESS, ""}; - } + ndd::OperationResult<> save_schema_internal(); /* * Registers a field type in the filter schema if it is not already present. 
@@ -151,24 +76,7 @@ class Filter { * 3 = field type mismatch with existing schema; caller should return HTTP 400 * 100-199 = propagated MDBX/storage failure from schema persistence */ - ndd::OperationResult<> register_field_type(const std::string& field, FieldType type) { - std::lock_guard lock(schema_mutex_); - auto it = schema_cache_.find(field); - if(it != schema_cache_.end()) { - if(it->second == type) { - return {SUCCESS, ""}; - } - return {3, "Filter field '" + field + "' has a different existing type"}; - } - - schema_cache_[field] = type; - auto save_result = save_schema_internal(); - if(!save_result.ok()) { - schema_cache_.erase(field); - return save_result; - } - return {SUCCESS, ""}; - } + ndd::OperationResult<> register_field_type(const std::string& field, FieldType type); /* * Converts a JSON number into the current sortable numeric filter encoding. @@ -178,15 +86,7 @@ class Filter { * 2 = value is not numeric; caller should return HTTP 400 */ static ndd::OperationResult sortable_from_json(const nlohmann::json& value, - const std::string& context) { - if(value.is_number_integer()) { - return {SUCCESS, "", ndd::filter::int_to_sortable(value.get())}; - } - if(value.is_number()) { - return {SUCCESS, "", ndd::filter::float_to_sortable(value.get())}; - } - return {2, context + " must be a number"}; - } + const std::string& context); /* * Converts a JSON scalar into the category key value representation. @@ -196,27 +96,7 @@ class Filter { * 2 = value is not a supported category scalar or is too long; caller should return HTTP 400 */ static ndd::OperationResult category_value_from_json(const nlohmann::json& value, - const std::string& context) { - std::string str_val; - if(value.is_string()) { - str_val = value.get(); - } else if(value.is_boolean()) { - str_val = value.get() ? 
"1" : "0"; - } else if(value.is_number_integer()) { - str_val = std::to_string(value.get()); - } else { - return {2, context + " must be string, integer, or boolean"}; - } - - if(str_val.size() > 255) { - return {2, context + " is too long"}; - } - auto delim_check = validate_filter_key_component(str_val, context); - if(!delim_check.ok()) { - return {delim_check.code, delim_check.message}; - } - return {SUCCESS, "", std::move(str_val)}; - } + const std::string& context); // Rejects ':' because it is the MDBX key delimiter for category and numeric // indexes (see format_filter_key, NumericIndex::make_*_key). Allowing ':' in @@ -224,16 +104,9 @@ class Filter { // pairs. static ndd::OperationResult<> validate_filter_key_component(const std::string& component, - const std::string& context) { - if(component.find(':') != std::string::npos) { - return {1, context + " must not contain ':'"}; - } - return {SUCCESS, ""}; - } + const std::string& context); - static std::string format_filter_key(const std::string& field, const std::string& value) { - return field + ":" + value; - } + static std::string format_filter_key(const std::string& field, const std::string& value); /* * Resolves [$lt | $lte | $gt | $gte] on a JSON numeric value into a @@ -247,149 +120,16 @@ class Filter { * caller should return HTTP 400 */ static ndd::OperationResult> - numeric_bound_from_comparison(const std::string& op, const nlohmann::json& val) { - using Bound = std::pair; - constexpr uint32_t SORTABLE_MIN = 0x00000000u; - constexpr uint32_t SORTABLE_MAX = 0xFFFFFFFFu; - const Bound EMPTY{SORTABLE_MAX, SORTABLE_MIN}; - - if(!val.is_number()) { - return {2, op + " value must be a finite number"}; - } - if(!val.is_number_integer() && !std::isfinite(val.get())) { - return {2, op + " value must be a finite number"}; - } - - if(op == "$gte") { - auto sortable_result = sortable_from_json(val, op + " value"); - if(!sortable_result.ok()) { - return {sortable_result.code, sortable_result.message}; - } - 
return {SUCCESS, "", Bound{sortable_result.value_or_throw(), SORTABLE_MAX}}; - } - if(op == "$lte") { - auto sortable_result = sortable_from_json(val, op + " value"); - if(!sortable_result.ok()) { - return {sortable_result.code, sortable_result.message}; - } - return {SUCCESS, "", Bound{SORTABLE_MIN, sortable_result.value_or_throw()}}; - } - if(op == "$gt") { - if(val.is_number_integer()) { - int32_t x = val.get(); - if(x == std::numeric_limits::max()) { - return {SUCCESS, "", EMPTY}; - } - return {SUCCESS, "", Bound{ndd::filter::int_to_sortable(x + 1), SORTABLE_MAX}}; - } - float x = val.get(); - float next = std::nextafterf(x, std::numeric_limits::infinity()); - if(!std::isfinite(next)) { - return {SUCCESS, "", EMPTY}; - } - return {SUCCESS, "", Bound{ndd::filter::float_to_sortable(next), SORTABLE_MAX}}; - } - if(op == "$lt") { - if(val.is_number_integer()) { - int32_t x = val.get(); - if(x == std::numeric_limits::min()) { - return {SUCCESS, "", EMPTY}; - } - return {SUCCESS, "", Bound{SORTABLE_MIN, ndd::filter::int_to_sortable(x - 1)}}; - } - float x = val.get(); - float next = std::nextafterf(x, -std::numeric_limits::infinity()); - if(!std::isfinite(next)) { - return {SUCCESS, "", EMPTY}; - } - return {SUCCESS, "", Bound{SORTABLE_MIN, ndd::filter::float_to_sortable(next)}}; - } - - return {2, "Unsupported numeric comparison operator: " + op}; - } - - void init_environment() { - int rc = mdbx_env_create(&env_); - if(rc != MDBX_SUCCESS) { - throw std::runtime_error(std::string("Failed to create LMDB env for filters: ") - + mdbx_strerror(rc)); - } - - // max DBs to allow multiple databases (main + schema + numeric_forward + numeric_inverted) - rc = mdbx_env_set_maxdbs(env_, 10); - if(rc != MDBX_SUCCESS) { - throw std::runtime_error(std::string("Failed to configure max DBs for filters: ") - + mdbx_strerror(rc)); - } - - // Set geometry for auto-grow using the filter map size settings - rc = mdbx_env_set_geometry(env_, - -1, - 1ULL << settings::FILTER_MAP_SIZE_BITS, 
- 1ULL << settings::FILTER_MAP_SIZE_MAX_BITS, - 1ULL << settings::FILTER_MAP_SIZE_BITS, - -1, - -1); - if(rc != MDBX_SUCCESS) { - throw std::runtime_error(std::string("Failed to set geometry for filters: ") - + mdbx_strerror(rc)); - } - - rc = mdbx_env_open(env_, - path_.c_str(), - MDBX_WRITEMAP | MDBX_MAPASYNC | MDBX_NORDAHEAD, - 0664); - if(rc != MDBX_SUCCESS) { - throw std::runtime_error(std::string("Failed to open filter environment: ") - + mdbx_strerror(rc)); - } - - MDBX_txn* txn = nullptr; - rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); - if(rc != MDBX_SUCCESS) { - throw std::runtime_error(std::string("Failed to begin filter transaction: ") - + mdbx_strerror(rc)); - } - - rc = mdbx_dbi_open(txn, nullptr, MDBX_CREATE, &dbi_); - if(rc != MDBX_SUCCESS) { - mdbx_txn_abort(txn); - throw std::runtime_error(std::string("Failed to open filter database: ") - + mdbx_strerror(rc)); - } - - rc = mdbx_txn_commit(txn); - if(rc != MDBX_SUCCESS) { - throw std::runtime_error(std::string("Failed to commit filter transaction: ") - + mdbx_strerror(rc)); - } - - // Initialize Indices - numeric_index_ = std::make_unique(env_); - category_index_ = std::make_unique(env_); - - auto schema_result = load_schema(); - if(!schema_result.ok()) { - LOG_ERROR(1201, index_id_, schema_result.message); - throw std::runtime_error(schema_result.message); - } - } + numeric_bound_from_comparison(const std::string& op, const nlohmann::json& val); + + void init_environment(); public: - Filter(const std::string& path, const std::string& index_id) : - index_id_(index_id), - path_(path) { - std::filesystem::create_directories(path); - init_environment(); - } + Filter(const std::string& path, const std::string& index_id); - Filter(const std::string& path) : - Filter(path, "-/-") {} + Filter(const std::string& path); - ~Filter() { - mdbx_dbi_close(env_, dbi_); - mdbx_env_close(env_); - } + ~Filter(); /* * Computes the bitmap for an AND filter query. 
@@ -402,180 +142,7 @@ class Filter { * 200-299 = propagated corruption/invariant failure from category or numeric index */ ndd::OperationResult - computeFilterBitmap(const nlohmann::json& filter_array) const { - if(!filter_array.is_array()) { - return {1, "Filter must be an array"}; - } - - if(filter_array.empty()) { - return {SUCCESS, "", ndd::RoaringBitmap()}; - } - - std::vector partial_results; - partial_results.reserve(filter_array.size()); - - for(const auto& condition : filter_array) { - if(!condition.is_object() || condition.size() != 1) { - return {1, "Each filter condition must be a single-field object"}; - } - - const auto& field = condition.begin().key(); - const auto& expr = condition.begin().value(); - if(field.empty()) { - return {1, "Filter field name cannot be empty"}; - } - auto field_check = validate_filter_key_component(field, "Filter field name"); - if(!field_check.ok()) { - return {field_check.code, field_check.message}; - } - if(!expr.is_object() || expr.size() != 1) { - return {1, "Filter operator must be a single-field object"}; - } - - // Check schema for field type - FieldType type = FieldType::Unknown; - { - std::lock_guard lock(schema_mutex_); - auto it = schema_cache_.find(field); - if(it != schema_cache_.end()) { - type = it->second; - } - } - - const std::string op = expr.begin().key(); - const auto& val = expr.begin().value(); - ndd::RoaringBitmap or_result; - - if(op == "$eq") { - if(type == FieldType::Number) { - auto sortable_result = sortable_from_json(val, "$eq value for numeric field"); - if(!sortable_result.ok()) { - return {sortable_result.code, sortable_result.message}; - } - auto range_result = - numeric_index_->range(field, sortable_result.value_or_throw(), sortable_result.value_or_throw()); - if(!range_result.ok()) { - return {range_result.code, range_result.message}; - } - or_result = std::move(range_result.value_or_throw()); - } else { - auto value_result = category_value_from_json(val, "$eq value"); - 
if(!value_result.ok()) { - return {value_result.code, value_result.message}; - } - auto bitmap_result = category_index_->get_bitmap_by_key( - format_filter_key(field, value_result.value_or_throw())); - if(!bitmap_result.ok()) { - return {bitmap_result.code, bitmap_result.message}; - } - or_result = std::move(bitmap_result.value_or_throw()); - } - } else if(op == "$in") { - if(!val.is_array()) { - return {2, "$in must be an array"}; - } - - for(const auto& item : val) { - if(type == FieldType::Number) { - auto sortable_result = - sortable_from_json(item, "$in value for numeric field"); - if(!sortable_result.ok()) { - return {sortable_result.code, sortable_result.message}; - } - auto range_result = numeric_index_->range(field, - sortable_result.value_or_throw(), - sortable_result.value_or_throw()); - if(!range_result.ok()) { - return {range_result.code, range_result.message}; - } - or_result |= range_result.value_or_throw(); - } else { - auto value_result = category_value_from_json(item, "$in value"); - if(!value_result.ok()) { - return {value_result.code, value_result.message}; - } - if(!value_result.value_or_throw().empty()) { - auto bitmap_result = category_index_->get_bitmap_by_key( - format_filter_key(field, value_result.value_or_throw())); - if(!bitmap_result.ok()) { - return {bitmap_result.code, bitmap_result.message}; - } - or_result |= bitmap_result.value_or_throw(); - } - } - } - } else if(op == "$range") { - if(!val.is_array() || val.size() != 2) { - return {2, "$range must be [start, end] with exactly 2 values"}; - } - if(type != FieldType::Number) { - return {2, "$range operator is only supported for numeric fields"}; - } - - auto start_result = sortable_from_json(val[0], "Range start"); - if(!start_result.ok()) { - return {start_result.code, start_result.message}; - } - auto end_result = sortable_from_json(val[1], "Range end"); - if(!end_result.ok()) { - return {end_result.code, end_result.message}; - } - if(start_result.value_or_throw() > 
end_result.value_or_throw()) { - return {2, "Invalid range: start > end"}; - } - - auto range_result = - numeric_index_->range(field, start_result.value_or_throw(), end_result.value_or_throw()); - if(!range_result.ok()) { - return {range_result.code, range_result.message}; - } - or_result = std::move(range_result.value_or_throw()); - } else if(op == "$lt" || op == "$lte" || op == "$gt" || op == "$gte") { - if(type != FieldType::Number) { - return {2, op + " operator is only supported for numeric fields"}; - } - auto bound_result = numeric_bound_from_comparison(op, val); - if(!bound_result.ok()) { - return {bound_result.code, bound_result.message}; - } - auto [min_val, max_val] = bound_result.value_or_throw(); - if(min_val <= max_val) { - auto range_result = numeric_index_->range(field, min_val, max_val); - if(!range_result.ok()) { - return {range_result.code, range_result.message}; - } - or_result = std::move(range_result.value_or_throw()); - } - } else { - return {2, "Unsupported filter operator: " + op}; - } - - partial_results.push_back(std::move(or_result)); - } - - // Optimization: Sort by cardinality (smallest first) - std::sort(partial_results.begin(), - partial_results.end(), - [](const ndd::RoaringBitmap& left, const ndd::RoaringBitmap& right) { - return left.cardinality() < right.cardinality(); - }); - - if(partial_results.empty()) { - return {SUCCESS, "", ndd::RoaringBitmap()}; - } - - ndd::RoaringBitmap final_result = partial_results[0]; - for(size_t i = 1; i < partial_results.size(); ++i) { - final_result &= partial_results[i]; - - // If result becomes empty, stop early - if(final_result.isEmpty()) { - return {SUCCESS, "", std::move(final_result)}; - } - } - - return {SUCCESS, "", std::move(final_result)}; - } + computeFilterBitmap(const nlohmann::json& filter_array) const; /** * Returns numeric ids matching a filter query based on the provided JSON filter array @@ -587,22 +154,7 @@ class Filter { * 200-299 = propagated corruption/invariant failure 
from bitmap computation */ ndd::OperationResult> - getIdsMatchingFilter(const nlohmann::json& filter_array) const { - auto bitmap_result = computeFilterBitmap(filter_array); - if(!bitmap_result.ok()) { - return {bitmap_result.code, bitmap_result.message}; - } - - std::vector ids; - ids.reserve(bitmap_result.value_or_throw().cardinality()); - bitmap_result.value_or_throw().iterate( - [](ndd::idInt val, void* ptr) { - static_cast*>(ptr)->push_back(val); - return true; - }, - &ids); - return {SUCCESS, "", std::move(ids)}; - } + getIdsMatchingFilter(const nlohmann::json& filter_array) const; /* * Counts numeric ids matching a filter query. @@ -613,13 +165,7 @@ class Filter { * 100-199 = propagated MDBX/storage failure from bitmap computation * 200-299 = propagated corruption/invariant failure from bitmap computation */ - ndd::OperationResult countIdsMatchingFilter(const nlohmann::json& filter_array) const { - auto bitmap_result = computeFilterBitmap(filter_array); - if(!bitmap_result.ok()) { - return {bitmap_result.code, bitmap_result.message}; - } - return {SUCCESS, "", bitmap_result.value_or_throw().cardinality()}; - } + ndd::OperationResult countIdsMatchingFilter(const nlohmann::json& filter_array) const; /* * Adds one id to a category filter. @@ -630,17 +176,7 @@ class Filter { * 200-299 = propagated corruption/invariant failure from category index */ ndd::OperationResult<> - add_to_filter(const std::string& field, const std::string& value, ndd::idInt numeric_id) { - auto field_check = validate_filter_key_component(field, "Filter field name"); - if(!field_check.ok()) { - return field_check; - } - auto value_check = validate_filter_key_component(value, "Filter value"); - if(!value_check.ok()) { - return value_check; - } - return category_index_->add(field, value, numeric_id); - } + add_to_filter(const std::string& field, const std::string& value, ndd::idInt numeric_id); /* * Adds a batch of ids to one already formatted category filter key. 
@@ -651,12 +187,7 @@ class Filter { * 200-299 = propagated corruption/invariant failure from category index */ ndd::OperationResult<> add_to_filter_batch(const std::string& filter_key, - const std::vector& numeric_ids) { - if(numeric_ids.empty()) { - return {SUCCESS, ""}; - } - return category_index_->add_batch_by_key(filter_key, numeric_ids); - } + const std::vector& numeric_ids); /* * Adds one batch of filter JSON documents into the numeric and category indexes. @@ -670,103 +201,7 @@ class Filter { * 200-299 = propagated corruption/invariant failure from numeric or category writes */ ndd::OperationResult<> add_filters_from_json_batch( - const std::vector>& id_filter_pairs) { - if(id_filter_pairs.empty()) { - return {SUCCESS, ""}; - } - - // Create a map to collect IDs for each label filter - std::unordered_map> label_filter_to_ids; - label_filter_to_ids.reserve(id_filter_pairs.size()); - std::vector numeric_filter_entries; - numeric_filter_entries.reserve(id_filter_pairs.size()); - - // Group IDs by filter - for(const auto& [numeric_id, filter_json] : id_filter_pairs) { - nlohmann::json parsed; - try { - parsed = nlohmann::json::parse(filter_json); - } catch(const std::exception& e) { - return {1, "Invalid filter JSON: " + std::string(e.what())}; - } - - if(!parsed.is_object()) { - return {1, "Filter JSON document must be an object"}; - } - - for(const auto& [field, value] : parsed.items()) { - if(field.empty()) { - return {1, "Filter field name cannot be empty"}; - } - auto field_check = validate_filter_key_component(field, "Filter field name"); - if(!field_check.ok()) { - return {field_check.code, field_check.message}; - } - - FieldType type = FieldType::Unknown; - if(value.is_boolean()) { - type = FieldType::Bool; - } else if(value.is_number()) { - type = FieldType::Number; - } else if(value.is_string()) { - type = FieldType::String; - } - - if(type == FieldType::Unknown) { - return {2, "Unsupported filter type for field '" + field + "'"}; - } - - auto 
register_result = register_field_type(field, type); - if(!register_result.ok()) { - return register_result; - } - - if(type == FieldType::String) { - auto category_result = category_value_from_json(value, "Filter value"); - if(!category_result.ok()) { - return {category_result.code, - category_result.message + " for field '" + field + "'"}; - } - label_filter_to_ids[format_filter_key(field, category_result.value_or_throw())] - .emplace_back(numeric_id); - } else if(type == FieldType::Bool) { - label_filter_to_ids[format_filter_key(field, value.get() ? "1" : "0")] - .emplace_back(numeric_id); - } else if(type == FieldType::Number) { - auto sortable_result = sortable_from_json(value, "Numeric filter value"); - if(!sortable_result.ok()) { - return {sortable_result.code, - sortable_result.message + " for field '" + field + "'"}; - } - numeric_filter_entries.emplace_back(field, numeric_id, sortable_result.value_or_throw()); - } - } - } - - /** - * XXX: For transactional correctness of filter adds, all the filters - * should be added in a single transaction. - * For now, they are being added in two different transactions. - * one for numeric_index and other for labels. - */ - - if(!numeric_filter_entries.empty()) { - auto numeric_result = numeric_index_->put_batch(numeric_filter_entries); - if(!numeric_result.ok()) { - return numeric_result; - } - } - - // Process each filter with its batch of IDs - for(const auto& [filter_key, ids] : label_filter_to_ids) { - auto add_result = add_to_filter_batch(filter_key, ids); - if(!add_result.ok()) { - return add_result; - } - } - - return {SUCCESS, ""}; - } + const std::vector>& id_filter_pairs); /* * Removes one id from a category filter. 
@@ -779,17 +214,7 @@ class Filter { ndd::OperationResult<> remove_from_filter(const std::string& field, const std::string& value, - ndd::idInt numeric_id) { - auto field_check = validate_filter_key_component(field, "Filter field name"); - if(!field_check.ok()) { - return field_check; - } - auto value_check = validate_filter_key_component(value, "Filter value"); - if(!value_check.ok()) { - return value_check; - } - return category_index_->remove(field, value, numeric_id); - } + ndd::idInt numeric_id); /* * Checks whether one id is present in a category filter. @@ -800,17 +225,7 @@ class Filter { * 200-299 = propagated corruption/invariant failure from category index */ ndd::OperationResult - contains(const std::string& field, const std::string& value, ndd::idInt numeric_id) const { - auto field_check = validate_filter_key_component(field, "Filter field name"); - if(!field_check.ok()) { - return {field_check.code, field_check.message}; - } - auto value_check = validate_filter_key_component(value, "Filter value"); - if(!value_check.ok()) { - return {value_check.code, value_check.message}; - } - return category_index_->contains(field, value, numeric_id); - } + contains(const std::string& field, const std::string& value, ndd::idInt numeric_id) const; /* * Adds one filter JSON document into the numeric and category indexes. @@ -822,9 +237,7 @@ class Filter { * 200-299 = propagated corruption/invariant failure from batch add */ ndd::OperationResult<> add_filters_from_json(ndd::idInt numeric_id, - const std::string& filter_json) { - return add_filters_from_json_batch({{numeric_id, filter_json}}); - } + const std::string& filter_json); /* * Removes one filter JSON document from the numeric and category indexes. 
@@ -837,53 +250,7 @@ class Filter { * 200-299 = propagated corruption/invariant failure from numeric or category index */ ndd::OperationResult<> remove_filters_from_json(ndd::idInt numeric_id, - const std::string& filter_json) { - nlohmann::json parsed; - try { - parsed = nlohmann::json::parse(filter_json); - } catch(const std::exception& e) { - return {1, "Invalid filter JSON while removing filters: " + std::string(e.what())}; - } - - if(!parsed.is_object()) { - return {1, "Filter JSON document must be an object"}; - } - - for(const auto& [field, value] : parsed.items()) { - if(field.empty()) { - return {1, "Filter field name cannot be empty"}; - } - auto field_check = validate_filter_key_component(field, "Filter field name"); - if(!field_check.ok()) { - return {field_check.code, field_check.message}; - } - - ndd::OperationResult<> remove_result{SUCCESS, ""}; - if(value.is_string()) { - auto category_result = category_value_from_json(value, "Filter value"); - if(!category_result.ok()) { - return {category_result.code, - category_result.message + " for field '" + field + "'"}; - } - remove_result = remove_from_filter(field, category_result.value_or_throw(), numeric_id); - } else if(value.is_number()) { - // Remove from Numeric Index - remove_result = numeric_index_->remove(field, numeric_id); - } else if(value.is_boolean()) { - remove_result = remove_from_filter(field, - value.get() ? "1" : "0", - numeric_id); - } else { - return {2, "Unsupported filter type for field '" + field + "'"}; - } - - if(!remove_result.ok()) { - return remove_result; - } - } - - return {SUCCESS, ""}; - } + const std::string& filter_json); /* * Combines category filters with AND semantics. 
@@ -894,23 +261,7 @@ class Filter { * 200-299 = propagated corruption/invariant failure from category index */ ndd::OperationResult combine_filters_and( - const std::vector>& filters) const { - ndd::RoaringBitmap result; - bool first = true; - for(const auto& [field, value] : filters) { - auto bitmap_result = category_index_->get_bitmap(field, value); - if(!bitmap_result.ok()) { - return {bitmap_result.code, bitmap_result.message}; - } - if(first) { - result = std::move(bitmap_result.value_or_throw()); - first = false; - } else { - result &= bitmap_result.value_or_throw(); - } - } - return {SUCCESS, "", std::move(result)}; - } + const std::vector>& filters) const; /* * Combines category filters with OR semantics. @@ -921,17 +272,7 @@ class Filter { * 200-299 = propagated corruption/invariant failure from category index */ ndd::OperationResult combine_filters_or( - const std::vector>& filters) const { - ndd::RoaringBitmap result; - for(const auto& [field, value] : filters) { - auto bitmap_result = category_index_->get_bitmap(field, value); - if(!bitmap_result.ok()) { - return {bitmap_result.code, bitmap_result.message}; - } - result |= bitmap_result.value_or_throw(); - } - return {SUCCESS, "", std::move(result)}; - } + const std::vector>& filters) const; /* * Checks whether one id satisfies one numeric filter expression. 
@@ -945,74 +286,5 @@ class Filter { ndd::OperationResult check_numeric(const std::string& field, ndd::idInt id, const std::string& op, - const nlohmann::json& val) const { - if(op == "$eq") { - auto sortable_result = sortable_from_json(val, "$eq value for numeric field"); - if(!sortable_result.ok()) { - return {sortable_result.code, sortable_result.message}; - } - return numeric_index_->check_range(field, - id, - sortable_result.value_or_throw(), - sortable_result.value_or_throw()); - } - - if(op == "$in") { - if(!val.is_array()) { - return {2, "$in must be an array"}; - } - for(const auto& item : val) { - auto sortable_result = sortable_from_json(item, "$in value for numeric field"); - if(!sortable_result.ok()) { - return {sortable_result.code, sortable_result.message}; - } - - auto check_result = numeric_index_->check_range(field, - id, - sortable_result.value_or_throw(), - sortable_result.value_or_throw()); - if(!check_result.ok()) { - return check_result; - } - if(check_result.value_or_throw()) { - return {SUCCESS, "", true}; - } - } - return {SUCCESS, "", false}; - } - - if(op == "$range") { - if(!val.is_array() || val.size() != 2) { - return {2, "$range must be [start, end] with exactly 2 values"}; - } - - auto start_result = sortable_from_json(val[0], "Range start"); - if(!start_result.ok()) { - return {start_result.code, start_result.message}; - } - auto end_result = sortable_from_json(val[1], "Range end"); - if(!end_result.ok()) { - return {end_result.code, end_result.message}; - } - if(start_result.value_or_throw() > end_result.value_or_throw()) { - return {2, "Invalid range: start > end"}; - } - - return numeric_index_->check_range(field, id, start_result.value_or_throw(), end_result.value_or_throw()); - } - - if(op == "$lt" || op == "$lte" || op == "$gt" || op == "$gte") { - auto bound_result = numeric_bound_from_comparison(op, val); - if(!bound_result.ok()) { - return {bound_result.code, bound_result.message}; - } - auto [min_val, max_val] = 
bound_result.value_or_throw(); - if(min_val > max_val) { - return {SUCCESS, "", false}; - } - return numeric_index_->check_range(field, id, min_val, max_val); - } - - return {2, "Unsupported numeric operator: " + op}; - } + const nlohmann::json& val) const; }; diff --git a/src/filter/numeric_index.cpp b/src/filter/numeric_index.cpp new file mode 100644 index 0000000000..bf72b28691 --- /dev/null +++ b/src/filter/numeric_index.cpp @@ -0,0 +1,905 @@ +#include "numeric_index.hpp" + +#include +#include +#include + +namespace ndd { + namespace filter { + + NumericBatchEntry::NumericBatchEntry(std::string field_in, + ndd::idInt id_in, + uint32_t value_in) : + field(std::move(field_in)), + id(id_in), + value(value_in) {} + + ndd::OperationResult + Bucket::read_bitmap_payload(const uint8_t* data, size_t len) { + if(len == 0) { + return {SUCCESS, "", ndd::RoaringBitmap()}; + } + if(data == nullptr) { + return {200, "empty bitmap payload"}; + } + + const char* bytes = reinterpret_cast(data); + const size_t consumed = + roaring::api::roaring_bitmap_portable_deserialize_size(bytes, len); + if(consumed == 0) { + return {200, "invalid or truncated bitmap payload"}; + } + if(consumed != len) { + return {200, + "bitmap payload length mismatch: consumed " + + std::to_string(consumed) + " of " + + std::to_string(len) + " bytes"}; + } + + ndd::RoaringBitmap bitmap; + try { + bitmap = ndd::RoaringBitmap::readSafe(bytes, len); + } catch(const std::exception& e) { + return {200, + "failed to deserialize bitmap payload: " + std::string(e.what())}; + } + + const char* reason = nullptr; + if(!roaring::api::roaring_bitmap_internal_validate(&bitmap.roaring, &reason)) { + return {200, + std::string("invalid bitmap internals") + + (reason != nullptr ? 
": " + std::string(reason) : "")}; + } + return {SUCCESS, "", std::move(bitmap)}; + } + + void Bucket::add(uint32_t val, ndd::idInt id) { + if (val < base_value) { + // Should not happen if Key logic is correct + throw std::runtime_error("Insert value < Base Value"); + } + uint32_t delta_32 = val - base_value; + if (delta_32 > MAX_DELTA) { + throw std::runtime_error("Delta overflow"); + } + + // Maintain sorted order by Value (Delta) + uint16_t delta = static_cast(delta_32); + + // Find insertion point + auto it = std::lower_bound(deltas.begin(), deltas.end(), delta); + size_t index = std::distance(deltas.begin(), it); + + deltas.insert(it, delta); + ids.insert(ids.begin() + index, id); + + summary_bitmap.add(id); + is_dirty = true; + } + + bool Bucket::remove(ndd::idInt id) { + // Find index by ID (linear scan needed as ids are not sorted) + for (size_t i = 0; i < ids.size(); ++i) { + if (ids[i] == id) { + ids.erase(ids.begin() + i); + deltas.erase(deltas.begin() + i); + + // Rebuild or update bitmap? Roaring remove is fast + summary_bitmap.remove(id); + is_dirty = true; + return true; + } + } + return false; + } + + std::vector Bucket::serialize() const { + // Optimize bitmap + const_cast(summary_bitmap).runOptimize(); + + size_t bm_size = summary_bitmap.getSizeInBytes(); + uint16_t count = static_cast(ids.size()); + + size_t total_size = 4 + bm_size + 2 + (count * 2) + (count * sizeof(ndd::idInt)); + std::vector buffer(total_size); + uint8_t* ptr = buffer.data(); + + // 1. Bitmap Header + uint32_t bm_size_32 = static_cast(bm_size); + std::memcpy(ptr, &bm_size_32, 4); ptr += 4; + + // 2. Bitmap Data + if (bm_size > 0) { + summary_bitmap.write(reinterpret_cast(ptr)); + ptr += bm_size; + } + + // 3. Count + std::memcpy(ptr, &count, 2); ptr += 2; + + // 4. Deltas + if (count > 0) { + std::memcpy(ptr, deltas.data(), count * 2); ptr += count * 2; + } + + // 5. 
IDs + if (count > 0) { + std::memcpy(ptr, ids.data(), count * sizeof(ndd::idInt)); + } + + return buffer; + } + + Bucket Bucket::deserialize(const void* data, size_t len, uint32_t base_val) { + Bucket b; + b.base_value = base_val; + + if (len < 6) return b; // Min valid size + + const uint8_t* ptr = static_cast(data); + const uint8_t* end = ptr + len; + + // 1. Bitmap Size + uint32_t bm_size; + std::memcpy(&bm_size, ptr, sizeof(uint32_t)); + ptr += sizeof(uint32_t); + if (bm_size > static_cast(end - ptr)) { + throw std::runtime_error("Bucket corrupt: invalid bitmap size"); + } + + // 2. Bitmap + if (bm_size > 0) { + auto bitmap_result = read_bitmap_payload(ptr, bm_size); + if(!bitmap_result.ok()) { + throw std::runtime_error("Bucket corrupt: " + + bitmap_result.message); + } + if(!bitmap_result.value.has_value()) { + throw std::runtime_error( + "Bucket corrupt: bitmap reader succeeded without a bitmap"); + } + b.summary_bitmap = std::move(*bitmap_result.value); + ptr += bm_size; + } + + if (ptr + 2 > end) throw std::runtime_error("Bucket corrupt: truncated count"); + + // 3. Count + uint16_t count; + std::memcpy(&count, ptr, 2); ptr += 2; + + // 4. 
Deltas & IDs + if (count > 0) { + size_t delta_size = count * 2; + size_t id_size = count * sizeof(ndd::idInt); + + if (ptr + delta_size + id_size > end) { + throw std::runtime_error("Bucket corrupt: truncated Data"); + } + + b.deltas.resize(count); + std::memcpy(b.deltas.data(), ptr, delta_size); ptr += delta_size; + + b.ids.resize(count); + std::memcpy(b.ids.data(), ptr, id_size); + } + + return b; + } + + ndd::RoaringBitmap Bucket::read_summary_bitmap(const void* data, size_t len) { + if (len < sizeof(uint32_t)) { + throw std::runtime_error("Bucket corrupt: missing bitmap size"); + } + const uint8_t* ptr = static_cast(data); + const uint8_t* end = ptr + len; + uint32_t bm_size; + std::memcpy(&bm_size, ptr, sizeof(uint32_t)); + ptr += sizeof(uint32_t); + if (bm_size > static_cast(end - ptr)) { + throw std::runtime_error("Bucket corrupt: invalid bitmap size"); + } + if (bm_size == 0) return ndd::RoaringBitmap(); + auto bitmap_result = read_bitmap_payload(ptr, bm_size); + if(!bitmap_result.ok()) { + throw std::runtime_error("Bucket corrupt: " + + bitmap_result.message); + } + if(!bitmap_result.value.has_value()) { + throw std::runtime_error( + "Bucket corrupt: bitmap reader succeeded without a bitmap"); + } + return std::move(*bitmap_result.value); + } + + std::string NumericIndex::make_forward_key(const std::string& field, ndd::idInt id) { + return field + ":" + std::to_string(id); + } + + std::string NumericIndex::make_bucket_key(const std::string& field, uint32_t start_val) { + uint32_t be_val = 0; +#if defined(__GNUC__) || defined(__clang__) + be_val = __builtin_bswap32(start_val); +#else + be_val = ((start_val >> 24) & 0xff) | ((start_val << 8) & 0xff0000) + | ((start_val >> 8) & 0xff00) | ((start_val << 24) & 0xff000000); +#endif + std::string key = field + ":"; + key.append(reinterpret_cast(&be_val), 4); + return key; + } + + uint32_t NumericIndex::parse_bucket_key_val(const std::string& key) { + if(key.size() < 4) { + return 0; + } + uint32_t be_val; + 
std::memcpy(&be_val, key.data() + key.size() - 4, 4); +#if defined(__GNUC__) || defined(__clang__) + return __builtin_bswap32(be_val); +#else + return ((be_val >> 24) & 0xff) | ((be_val << 8) & 0xff0000) + | ((be_val >> 8) & 0xff00) | ((be_val << 24) & 0xff000000); +#endif + } + + ndd::OperationResult<> + NumericIndex::remove_from_buckets(MDBX_txn* txn, + const std::string& field, + uint32_t value, + ndd::idInt id) { + // Find bucket + std::string bkey_str = make_bucket_key(field, value); + MDBX_val key{const_cast(bkey_str.data()), bkey_str.size()}; + MDBX_val data; + MDBX_cursor* cursor = nullptr; + int rc = mdbx_cursor_open(txn, inverted_dbi_, &cursor); + if(rc != MDBX_SUCCESS) { + return {100, "Failed to open numeric bucket remove cursor: " + + std::string(mdbx_strerror(rc))}; + } + + /** + * Scan backward to find bucket covering 'value'. + * Logic to find correct bucket: + */ + rc = mdbx_cursor_get(cursor, &key, &data, MDBX_SET_RANGE); + if(rc == MDBX_SUCCESS) { + // Check if we are in right field & range + std::string found_key(static_cast(key.iov_base), key.iov_len); + if(found_key.rfind(field + ":", 0) != 0 + || parse_bucket_key_val(found_key) > value) { + rc = mdbx_cursor_get(cursor, &key, &data, MDBX_PREV); + } + } else if(rc == MDBX_NOTFOUND) { + /** + * The only possible bucket that could still contain + * value is the very last bucket in the database. + * Hence jumping there. 
+ */ + rc = mdbx_cursor_get(cursor, &key, &data, MDBX_LAST); + } + + // Should be at correct bucket now + if(rc != MDBX_SUCCESS) { + mdbx_cursor_close(cursor); + if(rc == MDBX_NOTFOUND) { + return {SUCCESS, ""}; + } + return {100, "Failed to locate numeric bucket for remove: " + + std::string(mdbx_strerror(rc))}; + } + + std::string found_key(static_cast(key.iov_base), key.iov_len); + if(found_key.rfind(field + ":", 0) != 0) { + mdbx_cursor_close(cursor); + return {SUCCESS, ""}; + } + + uint32_t bucket_base = parse_bucket_key_val(found_key); + if(value < bucket_base) { + mdbx_cursor_close(cursor); + return {SUCCESS, ""}; + } + + try { + Bucket bucket = Bucket::deserialize(data.iov_base, data.iov_len, bucket_base); + if(bucket.remove(id)) { + // Save back or Delete if empty + if(bucket.is_empty()) { + rc = mdbx_cursor_del(cursor, static_cast(0)); + if(rc != MDBX_SUCCESS) { + mdbx_cursor_close(cursor); + return {100, "Failed to delete empty numeric bucket: " + + std::string(mdbx_strerror(rc))}; + } + } else { + auto bytes = bucket.serialize(); + MDBX_val new_data{bytes.data(), bytes.size()}; + rc = mdbx_cursor_put(cursor, &key, &new_data, MDBX_CURRENT); + if(rc != MDBX_SUCCESS) { + mdbx_cursor_close(cursor); + return {100, "Failed to update numeric bucket after remove: " + + std::string(mdbx_strerror(rc))}; + } + } + } + } catch(const std::exception& e) { + mdbx_cursor_close(cursor); + return {200, "Corrupt numeric bucket while removing id: " + + std::string(e.what())}; + } + + mdbx_cursor_close(cursor); + return {SUCCESS, ""}; + } + + ndd::OperationResult<> + NumericIndex::add_to_buckets(MDBX_txn* txn, + const std::string& field, + uint32_t value, + ndd::idInt id) { + MDBX_cursor* cursor = nullptr; + int rc = mdbx_cursor_open(txn, inverted_dbi_, &cursor); + if(rc != MDBX_SUCCESS) { + return {100, "Failed to open numeric bucket add cursor: " + + std::string(mdbx_strerror(rc))}; + } + + // Find candidate bucket + std::string search_key = make_bucket_key(field, value); 
+ MDBX_val key{const_cast(search_key.data()), search_key.size()}; + MDBX_val data; + + // Move logic to find predecessor + rc = mdbx_cursor_get(cursor, &key, &data, MDBX_SET_RANGE); + if(rc == MDBX_SUCCESS) { + std::string found_key(static_cast(key.iov_base), key.iov_len); + if(found_key.rfind(field + ":", 0) != 0 + || parse_bucket_key_val(found_key) > value) { + int prev_rc = mdbx_cursor_get(cursor, &key, &data, MDBX_PREV); + if(prev_rc == MDBX_SUCCESS) { + rc = prev_rc; + } else if(prev_rc != MDBX_NOTFOUND) { + mdbx_cursor_close(cursor); + return {100, "Failed to seek previous numeric bucket: " + + std::string(mdbx_strerror(prev_rc))}; + } else { + rc = MDBX_NOTFOUND; + } + } + } else if(rc == MDBX_NOTFOUND) { + rc = mdbx_cursor_get(cursor, &key, &data, MDBX_LAST); + if(rc != MDBX_SUCCESS && rc != MDBX_NOTFOUND) { + mdbx_cursor_close(cursor); + return {100, "Failed to seek last numeric bucket: " + + std::string(mdbx_strerror(rc))}; + } + } else { + mdbx_cursor_close(cursor); + return {100, "Failed to seek numeric bucket: " + + std::string(mdbx_strerror(rc))}; + } + + bool create_new = true; + std::string target_key_str; + uint32_t target_base = 0; + if(rc == MDBX_SUCCESS) { + std::string found_key(static_cast(key.iov_base), key.iov_len); + if(found_key.rfind(field + ":", 0) == 0) { + target_base = parse_bucket_key_val(found_key); + + // Check range condition + if(value >= target_base + && (static_cast(value) - target_base) + <= Bucket::MAX_DELTA) { + target_key_str = found_key; + create_new = false; + } + } + } + + try { + if(create_new) { + // Create new bucket at exact value + Bucket bucket; + bucket.base_value = value; + bucket.add(value, id); + auto bytes = bucket.serialize(); + + target_key_str = make_bucket_key(field, value); + MDBX_val k{const_cast(target_key_str.data()), + target_key_str.size()}; + MDBX_val v{bytes.data(), bytes.size()}; + rc = mdbx_put(txn, inverted_dbi_, &k, &v, MDBX_UPSERT); + if(rc != MDBX_SUCCESS) { + mdbx_cursor_close(cursor); + 
return {100, "Failed to create numeric bucket: " + + std::string(mdbx_strerror(rc))}; + } + } else { + /** + * Update existing. + * We must re-fetch current key/data because cursor move might have updated key/data. + */ + MDBX_val k{const_cast(target_key_str.data()), + target_key_str.size()}; + MDBX_val v; + rc = mdbx_cursor_get(cursor, &k, &v, MDBX_SET); + if(rc != MDBX_SUCCESS) { + // Should not happen if logic is correct + mdbx_cursor_close(cursor); + return {200, "Failed to resync numeric bucket cursor: " + + std::string(mdbx_strerror(rc))}; + } + + Bucket bucket = Bucket::deserialize(v.iov_base, v.iov_len, target_base); + // Capacity Check + if(bucket.ids.size() >= Bucket::MAX_SIZE) { + /** + * SPLIT LOGIC + * Sort is maintained by arrays. + * "Slide Split": Scan right from median. + * Ensure we don't split a group of identical values. + */ + size_t mid_idx = bucket.ids.size() / 2; + size_t probe_right = mid_idx; + while(probe_right < bucket.deltas.size() && probe_right > 0 + && bucket.deltas[probe_right] + == bucket.deltas[probe_right - 1]) { + probe_right++; + } + + if(probe_right < bucket.deltas.size()) { + mid_idx = probe_right; + } else { + // Fallback: Try scanning left + size_t probe_left = mid_idx; + while(probe_left > 0 + && bucket.deltas[probe_left] + == bucket.deltas[probe_left - 1]) { + probe_left--; + } + // All identical + mid_idx = probe_left > 0 ? probe_left : bucket.deltas.size(); + } + + // If we hit end, we can't split by value uniqueness + if(mid_idx == bucket.deltas.size()) { + /** + * Fallback: Just append (overfill) or implement logic to handle identicals. + * For now: Append. 
+ */ + bucket.add(value, id); + auto bytes = bucket.serialize(); + MDBX_val k2{const_cast(target_key_str.data()), + target_key_str.size()}; + MDBX_val v2{bytes.data(), bytes.size()}; + rc = mdbx_cursor_put(cursor, &k2, &v2, MDBX_CURRENT); + mdbx_cursor_close(cursor); + if(rc != MDBX_SUCCESS) { + return {100, "Failed to update overfull numeric bucket: " + + std::string(mdbx_strerror(rc))}; + } + return {SUCCESS, ""}; + } + + // Standard Slide Split + Bucket right_bucket; + right_bucket.base_value = bucket.base_value + bucket.deltas[mid_idx]; + // Move entries + for(size_t i = mid_idx; i < bucket.deltas.size(); ++i) { + right_bucket.add(bucket.base_value + bucket.deltas[i], + bucket.ids[i]); + } + + // Truncate left + bucket.deltas.resize(mid_idx); + bucket.ids.resize(mid_idx); + // Rebuild left bitmap + bucket.summary_bitmap = ndd::RoaringBitmap(); + for(auto bucket_id : bucket.ids) { + bucket.summary_bitmap.add(bucket_id); + } + + // Now add new value to correct bucket + if(value >= right_bucket.base_value) { + right_bucket.add(value, id); + } else { + /** + * If value < right, goes to left. + * But wait, split point was determined by existing items. + * If new value is >= base+split_delta, it goes right. + * BUT we just cleared right from b. + * Correct logic: + * Oh wait, if we added to left, we might overflow again or break order? + * Simply: Check which bucket covers it. + * Left covers [Base, RightBase-1]. + * Right covers [RightBase, ...]. 
+ */ + bucket.add(value, id); + } + + // Save Left + auto left_bytes = bucket.serialize(); + MDBX_val left_v{left_bytes.data(), left_bytes.size()}; + MDBX_val left_k{const_cast(target_key_str.data()), + target_key_str.size()}; + rc = mdbx_cursor_put(cursor, &left_k, &left_v, MDBX_CURRENT); + if(rc != MDBX_SUCCESS) { + mdbx_cursor_close(cursor); + return {100, "Failed to update split numeric bucket: " + + std::string(mdbx_strerror(rc))}; + } + + // Save Right + auto right_bytes = right_bucket.serialize(); + std::string right_k_str = + make_bucket_key(field, right_bucket.base_value); + MDBX_val right_k{const_cast(right_k_str.data()), + right_k_str.size()}; + MDBX_val right_v{right_bytes.data(), right_bytes.size()}; + // Use put for new key + rc = mdbx_put(txn, inverted_dbi_, &right_k, &right_v, MDBX_UPSERT); + if(rc != MDBX_SUCCESS) { + mdbx_cursor_close(cursor); + return {100, "Failed to write split numeric bucket: " + + std::string(mdbx_strerror(rc))}; + } + } else { + // Normal Insert + bucket.add(value, id); + auto bytes = bucket.serialize(); + MDBX_val new_data{bytes.data(), bytes.size()}; + // Use cursor put to update current + rc = mdbx_cursor_put(cursor, &k, &new_data, MDBX_CURRENT); + if(rc != MDBX_SUCCESS) { + mdbx_cursor_close(cursor); + return {100, "Failed to update numeric bucket: " + + std::string(mdbx_strerror(rc))}; + } + } + } + } catch(const std::exception& e) { + mdbx_cursor_close(cursor); + return {200, "Corrupt numeric bucket while adding id: " + + std::string(e.what())}; + } + + mdbx_cursor_close(cursor); + return {SUCCESS, ""}; + } + + ndd::OperationResult<> + NumericIndex::put_internal(MDBX_txn* txn, + const std::string& field, + ndd::idInt id, + uint32_t value) { + + // 1. 
Check Forward Index + std::string fwd_key_str = make_forward_key(field, id); + MDBX_val fwd_key{const_cast(fwd_key_str.data()), fwd_key_str.size()}; + MDBX_val fwd_val; + + int rc = mdbx_get(txn, forward_dbi_, &fwd_key, &fwd_val); + if(rc == MDBX_SUCCESS) { + if(fwd_val.iov_len != sizeof(uint32_t)) { + return {200, "Corrupt numeric forward value for field '" + field + "'"}; + } + uint32_t old_val; + std::memcpy(&old_val, fwd_val.iov_base, sizeof(uint32_t)); + if(old_val == value) { + return {SUCCESS, ""}; + } + auto remove_result = remove_from_buckets(txn, field, old_val, id); + if(!remove_result.ok()) { + return remove_result; + } + } else if(rc != MDBX_NOTFOUND) { + return {100, "Failed to read numeric forward value: " + + std::string(mdbx_strerror(rc))}; + } + + // 2. Update Forward + MDBX_val new_val_data{&value, sizeof(uint32_t)}; + rc = mdbx_put(txn, forward_dbi_, &fwd_key, &new_val_data, MDBX_UPSERT); + if(rc != MDBX_SUCCESS) { + return {100, "Failed to write numeric forward value: " + + std::string(mdbx_strerror(rc))}; + } + + // 3. 
Add to Inverted Buckets + return add_to_buckets(txn, field, value, id); + } + + NumericIndex::NumericIndex(MDBX_env* env) : + env_(env) { + MDBX_txn* txn = nullptr; + int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); + if(rc != MDBX_SUCCESS) { + throw std::runtime_error(std::string("Failed to begin NumericIndex init: ") + + mdbx_strerror(rc)); + } + + rc = mdbx_dbi_open(txn, "numeric_forward", MDBX_CREATE, &forward_dbi_); + if(rc != MDBX_SUCCESS) { + mdbx_txn_abort(txn); + throw std::runtime_error(std::string("Failed to open numeric_forward dbi: ") + + mdbx_strerror(rc)); + } + + rc = mdbx_dbi_open(txn, "numeric_inverted", MDBX_CREATE, &inverted_dbi_); + if(rc != MDBX_SUCCESS) { + mdbx_txn_abort(txn); + throw std::runtime_error(std::string("Failed to open numeric_inverted dbi: ") + + mdbx_strerror(rc)); + } + + rc = mdbx_txn_commit(txn); + if(rc != MDBX_SUCCESS) { + throw std::runtime_error(std::string("Failed to commit NumericIndex init: ") + + mdbx_strerror(rc)); + } + } + + ndd::OperationResult<> + NumericIndex::put_batch(const std::vector& entries) { + if(entries.empty()) { + return {SUCCESS, ""}; + } + + for(size_t start = 0; start < entries.size(); start += BATCH_TXN_CHUNK_SIZE) { + size_t end = std::min(start + BATCH_TXN_CHUNK_SIZE, entries.size()); + + MDBX_txn* txn = nullptr; + int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); + if(rc != MDBX_SUCCESS) { + return {100, "Failed to begin numeric batch write transaction: " + + std::string(mdbx_strerror(rc))}; + } + + for(size_t i = start; i < end; ++i) { + const auto& entry = entries[i]; + auto put_result = put_internal(txn, entry.field, entry.id, entry.value); + if(!put_result.ok()) { + mdbx_txn_abort(txn); + return put_result; + } + } + + rc = mdbx_txn_commit(txn); + if(rc != MDBX_SUCCESS) { + return {100, "Failed to commit numeric batch write transaction: " + + std::string(mdbx_strerror(rc))}; + } + } + return {SUCCESS, ""}; + } + + ndd::OperationResult<> + 
NumericIndex::remove(const std::string& field, ndd::idInt id) { + MDBX_txn* txn = nullptr; + int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); + if(rc != MDBX_SUCCESS) { + return {100, "Failed to begin numeric remove transaction: " + + std::string(mdbx_strerror(rc))}; + } + + std::string fwd_key_str = make_forward_key(field, id); + MDBX_val fwd_key{const_cast(fwd_key_str.data()), fwd_key_str.size()}; + MDBX_val fwd_val; + + rc = mdbx_get(txn, forward_dbi_, &fwd_key, &fwd_val); + if(rc == MDBX_NOTFOUND) { + mdbx_txn_abort(txn); + return {SUCCESS, ""}; + } + if(rc != MDBX_SUCCESS) { + mdbx_txn_abort(txn); + return {100, "Failed to read numeric forward value for remove: " + + std::string(mdbx_strerror(rc))}; + } + if(fwd_val.iov_len != sizeof(uint32_t)) { + mdbx_txn_abort(txn); + return {200, "Corrupt numeric forward value for field '" + field + "'"}; + } + + uint32_t old_val; + std::memcpy(&old_val, fwd_val.iov_base, sizeof(uint32_t)); + auto remove_result = remove_from_buckets(txn, field, old_val, id); + if(!remove_result.ok()) { + mdbx_txn_abort(txn); + return remove_result; + } + + rc = mdbx_del(txn, forward_dbi_, &fwd_key, nullptr); + if(rc != MDBX_SUCCESS && rc != MDBX_NOTFOUND) { + mdbx_txn_abort(txn); + return {100, "Failed to delete numeric forward value: " + + std::string(mdbx_strerror(rc))}; + } + + rc = mdbx_txn_commit(txn); + if(rc != MDBX_SUCCESS) { + return {100, "Failed to commit numeric remove transaction: " + + std::string(mdbx_strerror(rc))}; + } + return {SUCCESS, ""}; + } + + ndd::OperationResult + NumericIndex::range(const std::string& field, uint32_t min_val, uint32_t max_val) { + ndd::RoaringBitmap result; + MDBX_txn* txn = nullptr; + int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn); + if(rc != MDBX_SUCCESS) { + return {100, "Failed to begin numeric range transaction: " + + std::string(mdbx_strerror(rc))}; + } + + MDBX_cursor* cursor = nullptr; + rc = mdbx_cursor_open(txn, inverted_dbi_, &cursor); + if(rc != 
MDBX_SUCCESS) { + mdbx_txn_abort(txn); + return {100, "Failed to open numeric range cursor: " + + std::string(mdbx_strerror(rc))}; + } + + // 1. Find Start Bucket + std::string start_k = make_bucket_key(field, min_val); + MDBX_val key{const_cast(start_k.data()), start_k.size()}; + MDBX_val data; + + rc = mdbx_cursor_get(cursor, &key, &data, MDBX_SET_RANGE); + if(rc == MDBX_SUCCESS) { + std::string fkey(static_cast(key.iov_base), key.iov_len); + if(fkey.rfind(field + ":", 0) != 0 || parse_bucket_key_val(fkey) > min_val) { + // Check if we need to back up + MDBX_val prev_key = key; + MDBX_val prev_data; + // Check prev + int prev_rc = mdbx_cursor_get(cursor, &prev_key, &prev_data, MDBX_PREV); + if(prev_rc == MDBX_SUCCESS) { + std::string prev_key_str(static_cast(prev_key.iov_base), + prev_key.iov_len); + if(prev_key_str.rfind(field + ":", 0) == 0) { + // Prev is valid start + key = prev_key; + data = prev_data; + } + } else if(prev_rc != MDBX_NOTFOUND) { + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); + return {100, "Failed to seek previous numeric range bucket: " + + std::string(mdbx_strerror(prev_rc))}; + } + } + } else if(rc == MDBX_NOTFOUND) { + rc = mdbx_cursor_get(cursor, &key, &data, MDBX_LAST); + if(rc == MDBX_SUCCESS) { + std::string fkey(static_cast(key.iov_base), key.iov_len); + if(fkey.rfind(field + ":", 0) != 0) { + rc = MDBX_NOTFOUND; + } + } else if(rc != MDBX_NOTFOUND) { + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); + return {100, "Failed to seek last numeric range bucket: " + + std::string(mdbx_strerror(rc))}; + } + } else { + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); + return {100, "Failed to seek numeric range bucket: " + + std::string(mdbx_strerror(rc))}; + } + + try { + // Iterate forward + while(rc == MDBX_SUCCESS) { + std::string cur_key(static_cast(key.iov_base), key.iov_len); + if(cur_key.rfind(field + ":", 0) != 0) { + break; + } + + uint32_t bucket_base = parse_bucket_key_val(cur_key); + if(bucket_base > max_val) { + 
break; + } + + /** + * Peek Strategy: + * If bucket_base >= min_val, we know the start is covered. + * If we could know NEXT bucket start, we'd know overlap. + * Since we iterate, we can be greedy on read. + * + * For now, always deserialize. + * Potential optimization: Read only bitmap if we are "deep" in the range. + * e.g. min_val=10, max_val=100. Bucket=20. + * If bucket=20. Next Bucket=30. + * Then Bucket 20 covers [20..30). + * Range [10..100] covers [20..30] fully. + * So we need lookahead. + * + * Simple logic without lookahead: + * Just read full bucket. It's 8KB max (2 pages). + * It's fast unless we have millions of buckets. + */ + Bucket bucket = Bucket::deserialize(data.iov_base, + data.iov_len, + bucket_base); + if(!bucket.ids.empty()) { + uint32_t bucket_min = bucket.get_value(0); + uint32_t bucket_max = bucket.get_value(bucket.ids.size() - 1); + + if(bucket_min >= min_val && bucket_max <= max_val) { + // Full overlap + result |= bucket.summary_bitmap; + } else { + // Partial overlap + for(size_t i = 0; i < bucket.ids.size(); ++i) { + uint32_t value = bucket.get_value(i); + if(value >= min_val && value <= max_val) { + result.add(bucket.ids[i]); + } + } + } + } + + rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT); + } + } catch(const std::exception& e) { + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); + return {200, "Corrupt numeric bucket during range scan: " + + std::string(e.what())}; + } + + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); + if(rc != MDBX_SUCCESS && rc != MDBX_NOTFOUND) { + return {100, "Failed during numeric range scan: " + + std::string(mdbx_strerror(rc))}; + } + return {SUCCESS, "", std::move(result)}; + } + + ndd::OperationResult + NumericIndex::check_range(const std::string& field, + ndd::idInt id, + uint32_t min_val, + uint32_t max_val) { + MDBX_txn* txn = nullptr; + int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn); + if(rc != MDBX_SUCCESS) { + return {100, "Failed to begin numeric check transaction: " 
+ + std::string(mdbx_strerror(rc))}; + } + + std::string fwd_key_str = make_forward_key(field, id); + MDBX_val fwd_key{const_cast(fwd_key_str.data()), fwd_key_str.size()}; + MDBX_val fwd_val; + + rc = mdbx_get(txn, forward_dbi_, &fwd_key, &fwd_val); + if(rc == MDBX_NOTFOUND) { + mdbx_txn_abort(txn); + return {SUCCESS, "", false}; + } + if(rc != MDBX_SUCCESS) { + mdbx_txn_abort(txn); + return {100, "Failed to read numeric forward value during check: " + + std::string(mdbx_strerror(rc))}; + } + if(fwd_val.iov_len != sizeof(uint32_t)) { + mdbx_txn_abort(txn); + return {200, "Corrupt numeric forward value for field '" + field + "'"}; + } + + uint32_t value; + std::memcpy(&value, fwd_val.iov_base, sizeof(uint32_t)); + mdbx_txn_abort(txn); + return {SUCCESS, "", value >= min_val && value <= max_val}; + } + + } // namespace filter +} // namespace ndd diff --git a/src/filter/numeric_index.hpp b/src/filter/numeric_index.hpp index 84a43287a6..5eb283bf18 100644 --- a/src/filter/numeric_index.hpp +++ b/src/filter/numeric_index.hpp @@ -1,15 +1,11 @@ #pragma once #include -#include -#include #include -#include -#include -#include -#include +#include +#include + #include "mdbx/mdbx.h" -#include "../utils/log.hpp" #include "../core/types.hpp" #include "../utils/types.hpp" @@ -21,10 +17,7 @@ namespace ndd { ndd::idInt id; uint32_t value; - NumericBatchEntry(std::string field_in, ndd::idInt id_in, uint32_t value_in) : - field(std::move(field_in)), - id(id_in), - value(value_in) {} + NumericBatchEntry(std::string field_in, ndd::idInt id_in, uint32_t value_in); }; // --- Sortable Key Utilities --- @@ -77,189 +70,28 @@ namespace ndd { bool is_dirty = false; static ndd::OperationResult - read_bitmap_payload(const uint8_t* data, size_t len) { - if(len == 0) { - return {SUCCESS, "", ndd::RoaringBitmap()}; - } - if(data == nullptr) { - return {200, "empty bitmap payload"}; - } - - const char* bytes = reinterpret_cast(data); - const size_t consumed = - 
roaring::api::roaring_bitmap_portable_deserialize_size(bytes, len); - if(consumed == 0) { - return {200, "invalid or truncated bitmap payload"}; - } - if(consumed != len) { - return {200, - "bitmap payload length mismatch: consumed " - + std::to_string(consumed) + " of " - + std::to_string(len) + " bytes"}; - } - - ndd::RoaringBitmap bitmap; - try { - bitmap = ndd::RoaringBitmap::readSafe(bytes, len); - } catch(const std::exception& e) { - return {200, - "failed to deserialize bitmap payload: " + std::string(e.what())}; - } - - const char* reason = nullptr; - if(!roaring::api::roaring_bitmap_internal_validate(&bitmap.roaring, - &reason)) { - return {200, - std::string("invalid bitmap internals") - + (reason != nullptr ? ": " + std::string(reason) : "")}; - } - return {SUCCESS, "", std::move(bitmap)}; - } + read_bitmap_payload(const uint8_t* data, size_t len); // Helper to get actual value uint32_t get_value(size_t index) const { return base_value + deltas[index]; } - void add(uint32_t val, ndd::idInt id) { - if (val < base_value) { - // Should not happen if Key logic is correct - throw std::runtime_error("Insert value < Base Value"); - } - uint32_t delta_32 = val - base_value; - if (delta_32 > MAX_DELTA) { - throw std::runtime_error("Delta overflow"); - } - - // Maintain sorted order by Value (Delta) - uint16_t delta = static_cast(delta_32); - - // Find insertion point - auto it = std::lower_bound(deltas.begin(), deltas.end(), delta); - size_t index = std::distance(deltas.begin(), it); - - deltas.insert(it, delta); - ids.insert(ids.begin() + index, id); - - summary_bitmap.add(id); - is_dirty = true; - } - - bool remove(ndd::idInt id) { - // Find index by ID (linear scan needed as ids are not sorted) - for (size_t i = 0; i < ids.size(); ++i) { - if (ids[i] == id) { - ids.erase(ids.begin() + i); - deltas.erase(deltas.begin() + i); - - // Rebuild or update bitmap? 
Roaring remove is fast - summary_bitmap.remove(id); - is_dirty = true; - return true; - } - } - return false; - } - - // Serialization Format: - // [BitmapSize (4)] - // [Bitmap Bytes] - // [Count (2)] - // [Deltas (Count * 2)] - // [IDs (Count * sizeof(idInt))] - std::vector serialize() const { - // Optimize bitmap - const_cast(summary_bitmap).runOptimize(); - - size_t bm_size = summary_bitmap.getSizeInBytes(); - uint16_t count = static_cast(ids.size()); - - size_t total_size = 4 + bm_size + 2 + (count * 2) + (count * sizeof(ndd::idInt)); - std::vector buffer(total_size); - uint8_t* ptr = buffer.data(); - - // 1. Bitmap Header - uint32_t bm_size_32 = static_cast(bm_size); - std::memcpy(ptr, &bm_size_32, 4); ptr += 4; - - // 2. Bitmap Data - if (bm_size > 0) { - summary_bitmap.write(reinterpret_cast(ptr)); - ptr += bm_size; - } - - // 3. Count - std::memcpy(ptr, &count, 2); ptr += 2; - - // 4. Deltas - if (count > 0) { - std::memcpy(ptr, deltas.data(), count * 2); ptr += count * 2; - } - - // 5. IDs - if (count > 0) { - std::memcpy(ptr, ids.data(), count * sizeof(ndd::idInt)); - } - - return buffer; - } - - static Bucket deserialize(const void* data, size_t len, uint32_t base_val) { - Bucket b; - b.base_value = base_val; - - if (len < 6) return b; // Min valid size - - const uint8_t* ptr = static_cast(data); - const uint8_t* end = ptr + len; - - // 1. Bitmap Size - uint32_t bm_size; - std::memcpy(&bm_size, ptr, sizeof(uint32_t)); - ptr += sizeof(uint32_t); - if (bm_size > static_cast(end - ptr)) { - throw std::runtime_error("Bucket corrupt: invalid bitmap size"); - } - - // 2. 
Bitmap - if (bm_size > 0) { - auto bitmap_result = read_bitmap_payload(ptr, bm_size); - if(!bitmap_result.ok()) { - throw std::runtime_error("Bucket corrupt: " - + bitmap_result.message); - } - if(!bitmap_result.value.has_value()) { - throw std::runtime_error( - "Bucket corrupt: bitmap reader succeeded without a bitmap"); - } - b.summary_bitmap = std::move(*bitmap_result.value); - ptr += bm_size; - } + void add(uint32_t val, ndd::idInt id); - if (ptr + 2 > end) throw std::runtime_error("Bucket corrupt: truncated count"); + bool remove(ndd::idInt id); - // 3. Count - uint16_t count; - std::memcpy(&count, ptr, 2); ptr += 2; - - // 4. Deltas & IDs - if (count > 0) { - size_t delta_size = count * 2; - size_t id_size = count * sizeof(ndd::idInt); - - if (ptr + delta_size + id_size > end) { - throw std::runtime_error("Bucket corrupt: truncated Data"); - } - - b.deltas.resize(count); - std::memcpy(b.deltas.data(), ptr, delta_size); ptr += delta_size; + /** + * Serialization Format: + * [BitmapSize (4)] + * [Bitmap Bytes] + * [Count (2)] + * [Deltas (Count * 2)] + * [IDs (Count * sizeof(idInt))] + */ + std::vector serialize() const; - b.ids.resize(count); - std::memcpy(b.ids.data(), ptr, id_size); - } - - return b; - } + static Bucket deserialize(const void* data, size_t len, uint32_t base_val); /** * Fast access to just the bitmap. @@ -306,31 +138,7 @@ namespace ndd { * any corruption in the trailing region is caught by the full * `Bucket::deserialize` path that actually consumes those bytes. 
*/ - static ndd::RoaringBitmap read_summary_bitmap(const void* data, - size_t len) { - if (len < sizeof(uint32_t)) { - throw std::runtime_error("Bucket corrupt: missing bitmap size"); - } - const uint8_t* ptr = static_cast(data); - const uint8_t* end = ptr + len; - uint32_t bm_size; - std::memcpy(&bm_size, ptr, sizeof(uint32_t)); - ptr += sizeof(uint32_t); - if (bm_size > static_cast(end - ptr)) { - throw std::runtime_error("Bucket corrupt: invalid bitmap size"); - } - if (bm_size == 0) return ndd::RoaringBitmap(); - auto bitmap_result = read_bitmap_payload(ptr, bm_size); - if(!bitmap_result.ok()) { - throw std::runtime_error("Bucket corrupt: " - + bitmap_result.message); - } - if(!bitmap_result.value.has_value()) { - throw std::runtime_error( - "Bucket corrupt: bitmap reader succeeded without a bitmap"); - } - return std::move(*bitmap_result.value); - } + static ndd::RoaringBitmap read_summary_bitmap(const void* data, size_t len); bool is_full() const { return ids.size() >= MAX_SIZE; } bool is_empty() const { return ids.empty(); } @@ -343,37 +151,12 @@ namespace ndd { MDBX_dbi inverted_dbi_; // BucketKey -> BucketBlob static constexpr size_t BATCH_TXN_CHUNK_SIZE = 256; - std::string make_forward_key(const std::string& field, ndd::idInt id) { - return field + ":" + std::to_string(id); - } + std::string make_forward_key(const std::string& field, ndd::idInt id); // Key Format: [Field]:[BigEndian_BaseValue] - std::string make_bucket_key(const std::string& field, uint32_t start_val) { - uint32_t be_val = 0; -#if defined(__GNUC__) || defined(__clang__) - be_val = __builtin_bswap32(start_val); -#else - be_val = ((start_val >> 24) & 0xff) | ((start_val << 8) & 0xff0000) - | ((start_val >> 8) & 0xff00) | ((start_val << 24) & 0xff000000); -#endif - std::string key = field + ":"; - key.append(reinterpret_cast(&be_val), 4); - return key; - } + std::string make_bucket_key(const std::string& field, uint32_t start_val); - uint32_t parse_bucket_key_val(const std::string& key) { - 
if(key.size() < 4) { - return 0; - } - uint32_t be_val; - std::memcpy(&be_val, key.data() + key.size() - 4, 4); -#if defined(__GNUC__) || defined(__clang__) - return __builtin_bswap32(be_val); -#else - return ((be_val >> 24) & 0xff) | ((be_val << 8) & 0xff0000) - | ((be_val >> 8) & 0xff00) | ((be_val << 24) & 0xff000000); -#endif - } + uint32_t parse_bucket_key_val(const std::string& key); /* * Removes one id from the numeric inverted bucket that currently owns its old value. @@ -386,92 +169,7 @@ namespace ndd { ndd::OperationResult<> remove_from_buckets(MDBX_txn* txn, const std::string& field, uint32_t value, - ndd::idInt id) { - // Find bucket - std::string bkey_str = make_bucket_key(field, value); - MDBX_val key{const_cast(bkey_str.data()), bkey_str.size()}; - MDBX_val data; - MDBX_cursor* cursor = nullptr; - int rc = mdbx_cursor_open(txn, inverted_dbi_, &cursor); - if(rc != MDBX_SUCCESS) { - return {100, "Failed to open numeric bucket remove cursor: " - + std::string(mdbx_strerror(rc))}; - } - - /** - * Scan backward to find bucket covering 'value'. - * Logic to find correct bucket: - */ - rc = mdbx_cursor_get(cursor, &key, &data, MDBX_SET_RANGE); - if(rc == MDBX_SUCCESS) { - // Check if we are in right field & range - std::string found_key(static_cast(key.iov_base), key.iov_len); - if(found_key.rfind(field + ":", 0) != 0 - || parse_bucket_key_val(found_key) > value) { - rc = mdbx_cursor_get(cursor, &key, &data, MDBX_PREV); - } - } else if(rc == MDBX_NOTFOUND) { - /** - * The only possible bucket that could still contain - * value is the very last bucket in the database. - * Hence jumping there. 
- */ - rc = mdbx_cursor_get(cursor, &key, &data, MDBX_LAST); - } - - // Should be at correct bucket now - if(rc != MDBX_SUCCESS) { - mdbx_cursor_close(cursor); - if(rc == MDBX_NOTFOUND) { - return {SUCCESS, ""}; - } - return {100, "Failed to locate numeric bucket for remove: " - + std::string(mdbx_strerror(rc))}; - } - - std::string found_key(static_cast(key.iov_base), key.iov_len); - if(found_key.rfind(field + ":", 0) != 0) { - mdbx_cursor_close(cursor); - return {SUCCESS, ""}; - } - - uint32_t bucket_base = parse_bucket_key_val(found_key); - if(value < bucket_base) { - mdbx_cursor_close(cursor); - return {SUCCESS, ""}; - } - - try { - Bucket bucket = Bucket::deserialize(data.iov_base, data.iov_len, bucket_base); - if(bucket.remove(id)) { - // Save back or Delete if empty - if(bucket.is_empty()) { - rc = mdbx_cursor_del(cursor, static_cast(0)); - if(rc != MDBX_SUCCESS) { - mdbx_cursor_close(cursor); - return {100, "Failed to delete empty numeric bucket: " - + std::string(mdbx_strerror(rc))}; - } - } else { - auto bytes = bucket.serialize(); - MDBX_val new_data{bytes.data(), bytes.size()}; - rc = mdbx_cursor_put(cursor, &key, &new_data, MDBX_CURRENT); - if(rc != MDBX_SUCCESS) { - mdbx_cursor_close(cursor); - return {100, "Failed to update numeric bucket after remove: " - + std::string(mdbx_strerror(rc))}; - } - } - } - } catch(const std::exception& e) { - mdbx_cursor_close(cursor); - return {200, "Corrupt numeric bucket while removing id: " - + std::string(e.what())}; - } - - mdbx_cursor_close(cursor); - return {SUCCESS, ""}; - } + ndd::idInt id); /* * Adds one id/value pair into the numeric inverted bucket index. 
@@ -484,237 +182,7 @@ namespace ndd { ndd::OperationResult<> add_to_buckets(MDBX_txn* txn, const std::string& field, uint32_t value, - ndd::idInt id) { - MDBX_cursor* cursor = nullptr; - int rc = mdbx_cursor_open(txn, inverted_dbi_, &cursor); - if(rc != MDBX_SUCCESS) { - return {100, "Failed to open numeric bucket add cursor: " - + std::string(mdbx_strerror(rc))}; - } - - // Find candidate bucket - std::string search_key = make_bucket_key(field, value); - MDBX_val key{const_cast(search_key.data()), search_key.size()}; - MDBX_val data; - - // Move logic to find predecessor - rc = mdbx_cursor_get(cursor, &key, &data, MDBX_SET_RANGE); - if(rc == MDBX_SUCCESS) { - std::string found_key(static_cast(key.iov_base), key.iov_len); - if(found_key.rfind(field + ":", 0) != 0 - || parse_bucket_key_val(found_key) > value) { - int prev_rc = mdbx_cursor_get(cursor, &key, &data, MDBX_PREV); - if(prev_rc == MDBX_SUCCESS) { - rc = prev_rc; - } else if(prev_rc != MDBX_NOTFOUND) { - mdbx_cursor_close(cursor); - return {100, "Failed to seek previous numeric bucket: " - + std::string(mdbx_strerror(prev_rc))}; - } else { - rc = MDBX_NOTFOUND; - } - } - } else if(rc == MDBX_NOTFOUND) { - rc = mdbx_cursor_get(cursor, &key, &data, MDBX_LAST); - if(rc != MDBX_SUCCESS && rc != MDBX_NOTFOUND) { - mdbx_cursor_close(cursor); - return {100, "Failed to seek last numeric bucket: " - + std::string(mdbx_strerror(rc))}; - } - } else { - mdbx_cursor_close(cursor); - return {100, "Failed to seek numeric bucket: " - + std::string(mdbx_strerror(rc))}; - } - - bool create_new = true; - std::string target_key_str; - uint32_t target_base = 0; - if(rc == MDBX_SUCCESS) { - std::string found_key(static_cast(key.iov_base), key.iov_len); - if(found_key.rfind(field + ":", 0) == 0) { - target_base = parse_bucket_key_val(found_key); - - // Check range condition - if(value >= target_base - && (static_cast(value) - target_base) - <= Bucket::MAX_DELTA) { - target_key_str = found_key; - create_new = false; - } - } - } - 
- try { - if(create_new) { - // Create new bucket at exact value - Bucket bucket; - bucket.base_value = value; - bucket.add(value, id); - auto bytes = bucket.serialize(); - - target_key_str = make_bucket_key(field, value); - MDBX_val k{const_cast(target_key_str.data()), - target_key_str.size()}; - MDBX_val v{bytes.data(), bytes.size()}; - rc = mdbx_put(txn, inverted_dbi_, &k, &v, MDBX_UPSERT); - if(rc != MDBX_SUCCESS) { - mdbx_cursor_close(cursor); - return {100, "Failed to create numeric bucket: " - + std::string(mdbx_strerror(rc))}; - } - } else { - /** - * Update existing. - * We must re-fetch current key/data because cursor move might have updated key/data. - */ - MDBX_val k{const_cast(target_key_str.data()), - target_key_str.size()}; - MDBX_val v; - rc = mdbx_cursor_get(cursor, &k, &v, MDBX_SET); - if(rc != MDBX_SUCCESS) { - // Should not happen if logic is correct - mdbx_cursor_close(cursor); - return {200, "Failed to resync numeric bucket cursor: " - + std::string(mdbx_strerror(rc))}; - } - - Bucket bucket = Bucket::deserialize(v.iov_base, v.iov_len, target_base); - // Capacity Check - if(bucket.ids.size() >= Bucket::MAX_SIZE) { - /** - * SPLIT LOGIC - * Sort is maintained by arrays. - * "Slide Split": Scan right from median. - * Ensure we don't split a group of identical values. - */ - size_t mid_idx = bucket.ids.size() / 2; - size_t probe_right = mid_idx; - while(probe_right < bucket.deltas.size() && probe_right > 0 - && bucket.deltas[probe_right] - == bucket.deltas[probe_right - 1]) { - probe_right++; - } - - if(probe_right < bucket.deltas.size()) { - mid_idx = probe_right; - } else { - // Fallback: Try scanning left - size_t probe_left = mid_idx; - while(probe_left > 0 - && bucket.deltas[probe_left] - == bucket.deltas[probe_left - 1]) { - probe_left--; - } - // All identical - mid_idx = probe_left > 0 ? 
probe_left : bucket.deltas.size(); - } - - // If we hit end, we can't split by value uniqueness - if(mid_idx == bucket.deltas.size()) { - /** - * Fallback: Just append (overfill) or implement logic to handle identicals. - * For now: Append. - */ - bucket.add(value, id); - auto bytes = bucket.serialize(); - MDBX_val k2{const_cast(target_key_str.data()), - target_key_str.size()}; - MDBX_val v2{bytes.data(), bytes.size()}; - rc = mdbx_cursor_put(cursor, &k2, &v2, MDBX_CURRENT); - mdbx_cursor_close(cursor); - if(rc != MDBX_SUCCESS) { - return {100, "Failed to update overfull numeric bucket: " - + std::string(mdbx_strerror(rc))}; - } - return {SUCCESS, ""}; - } - - // Standard Slide Split - Bucket right_bucket; - right_bucket.base_value = bucket.base_value + bucket.deltas[mid_idx]; - // Move entries - for(size_t i = mid_idx; i < bucket.deltas.size(); ++i) { - right_bucket.add(bucket.base_value + bucket.deltas[i], - bucket.ids[i]); - } - - // Truncate left - bucket.deltas.resize(mid_idx); - bucket.ids.resize(mid_idx); - // Rebuild left bitmap - bucket.summary_bitmap = ndd::RoaringBitmap(); - for(auto bucket_id : bucket.ids) { - bucket.summary_bitmap.add(bucket_id); - } - - // Now add new value to correct bucket - if(value >= right_bucket.base_value) { - right_bucket.add(value, id); - } else { - /** - * If value < right, goes to left. - * But wait, split point was determined by existing items. - * If new value is >= base+split_delta, it goes right. - * BUT we just cleared right from b. - * Correct logic: - * Oh wait, if we added to left, we might overflow again or break order? - * Simply: Check which bucket covers it. - * Left covers [Base, RightBase-1]. - * Right covers [RightBase, ...]. 
- */ - bucket.add(value, id); - } - - // Save Left - auto left_bytes = bucket.serialize(); - MDBX_val left_v{left_bytes.data(), left_bytes.size()}; - MDBX_val left_k{const_cast(target_key_str.data()), - target_key_str.size()}; - rc = mdbx_cursor_put(cursor, &left_k, &left_v, MDBX_CURRENT); - if(rc != MDBX_SUCCESS) { - mdbx_cursor_close(cursor); - return {100, "Failed to update split numeric bucket: " - + std::string(mdbx_strerror(rc))}; - } - - // Save Right - auto right_bytes = right_bucket.serialize(); - std::string right_k_str = - make_bucket_key(field, right_bucket.base_value); - MDBX_val right_k{const_cast(right_k_str.data()), - right_k_str.size()}; - MDBX_val right_v{right_bytes.data(), right_bytes.size()}; - // Use put for new key - rc = mdbx_put(txn, inverted_dbi_, &right_k, &right_v, MDBX_UPSERT); - if(rc != MDBX_SUCCESS) { - mdbx_cursor_close(cursor); - return {100, "Failed to write split numeric bucket: " - + std::string(mdbx_strerror(rc))}; - } - } else { - // Normal Insert - bucket.add(value, id); - auto bytes = bucket.serialize(); - MDBX_val new_data{bytes.data(), bytes.size()}; - // Use cursor put to update current - rc = mdbx_cursor_put(cursor, &k, &new_data, MDBX_CURRENT); - if(rc != MDBX_SUCCESS) { - mdbx_cursor_close(cursor); - return {100, "Failed to update numeric bucket: " - + std::string(mdbx_strerror(rc))}; - } - } - } - } catch(const std::exception& e) { - mdbx_cursor_close(cursor); - return {200, "Corrupt numeric bucket while adding id: " - + std::string(e.what())}; - } - - mdbx_cursor_close(cursor); - return {SUCCESS, ""}; - } + ndd::idInt id); /* * Writes one numeric forward entry and updates the inverted buckets inside a caller transaction. @@ -729,74 +197,10 @@ namespace ndd { ndd::OperationResult<> put_internal(MDBX_txn* txn, const std::string& field, ndd::idInt id, - uint32_t value) { - - // 1. 
Check Forward Index - std::string fwd_key_str = make_forward_key(field, id); - MDBX_val fwd_key{const_cast(fwd_key_str.data()), fwd_key_str.size()}; - MDBX_val fwd_val; - - int rc = mdbx_get(txn, forward_dbi_, &fwd_key, &fwd_val); - if(rc == MDBX_SUCCESS) { - if(fwd_val.iov_len != sizeof(uint32_t)) { - return {200, "Corrupt numeric forward value for field '" + field + "'"}; - } - uint32_t old_val; - std::memcpy(&old_val, fwd_val.iov_base, sizeof(uint32_t)); - if(old_val == value) { - return {SUCCESS, ""}; - } - auto remove_result = remove_from_buckets(txn, field, old_val, id); - if(!remove_result.ok()) { - return remove_result; - } - } else if(rc != MDBX_NOTFOUND) { - return {100, "Failed to read numeric forward value: " - + std::string(mdbx_strerror(rc))}; - } - - // 2. Update Forward - MDBX_val new_val_data{&value, sizeof(uint32_t)}; - rc = mdbx_put(txn, forward_dbi_, &fwd_key, &new_val_data, MDBX_UPSERT); - if(rc != MDBX_SUCCESS) { - return {100, "Failed to write numeric forward value: " - + std::string(mdbx_strerror(rc))}; - } - - // 3. 
Add to Inverted Buckets - return add_to_buckets(txn, field, value, id); - } + uint32_t value); public: - NumericIndex(MDBX_env* env) : - env_(env) { - MDBX_txn* txn = nullptr; - int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); - if(rc != MDBX_SUCCESS) { - throw std::runtime_error(std::string("Failed to begin NumericIndex init: ") - + mdbx_strerror(rc)); - } - - rc = mdbx_dbi_open(txn, "numeric_forward", MDBX_CREATE, &forward_dbi_); - if(rc != MDBX_SUCCESS) { - mdbx_txn_abort(txn); - throw std::runtime_error(std::string("Failed to open numeric_forward dbi: ") - + mdbx_strerror(rc)); - } - - rc = mdbx_dbi_open(txn, "numeric_inverted", MDBX_CREATE, &inverted_dbi_); - if(rc != MDBX_SUCCESS) { - mdbx_txn_abort(txn); - throw std::runtime_error(std::string("Failed to open numeric_inverted dbi: ") - + mdbx_strerror(rc)); - } - - rc = mdbx_txn_commit(txn); - if(rc != MDBX_SUCCESS) { - throw std::runtime_error(std::string("Failed to commit NumericIndex init: ") - + mdbx_strerror(rc)); - } - } + NumericIndex(MDBX_env* env); /* * Writes a batch of numeric filter entries in bounded MDBX write transaction chunks. 
@@ -807,38 +211,7 @@ namespace ndd { * 100-199 = propagated MDBX/storage failure from per-entry writes * 200-299 = propagated corruption/invariant failure from per-entry writes */ - ndd::OperationResult<> put_batch(const std::vector& entries) { - if(entries.empty()) { - return {SUCCESS, ""}; - } - - for(size_t start = 0; start < entries.size(); start += BATCH_TXN_CHUNK_SIZE) { - size_t end = std::min(start + BATCH_TXN_CHUNK_SIZE, entries.size()); - - MDBX_txn* txn = nullptr; - int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); - if(rc != MDBX_SUCCESS) { - return {100, "Failed to begin numeric batch write transaction: " - + std::string(mdbx_strerror(rc))}; - } - - for(size_t i = start; i < end; ++i) { - const auto& entry = entries[i]; - auto put_result = put_internal(txn, entry.field, entry.id, entry.value); - if(!put_result.ok()) { - mdbx_txn_abort(txn); - return put_result; - } - } - - rc = mdbx_txn_commit(txn); - if(rc != MDBX_SUCCESS) { - return {100, "Failed to commit numeric batch write transaction: " - + std::string(mdbx_strerror(rc))}; - } - } - return {SUCCESS, ""}; - } + ndd::OperationResult<> put_batch(const std::vector& entries); /* * Removes one id from the numeric forward and inverted indexes for a field. 
@@ -850,55 +223,7 @@ namespace ndd { * 200 = corrupt numeric forward value; caller should log ERROR and return HTTP 500 * 200-299 = propagated corruption/invariant failure from bucket helpers */ - ndd::OperationResult<> remove(const std::string& field, ndd::idInt id) { - MDBX_txn* txn = nullptr; - int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_READWRITE, &txn); - if(rc != MDBX_SUCCESS) { - return {100, "Failed to begin numeric remove transaction: " - + std::string(mdbx_strerror(rc))}; - } - - std::string fwd_key_str = make_forward_key(field, id); - MDBX_val fwd_key{const_cast(fwd_key_str.data()), fwd_key_str.size()}; - MDBX_val fwd_val; - - rc = mdbx_get(txn, forward_dbi_, &fwd_key, &fwd_val); - if(rc == MDBX_NOTFOUND) { - mdbx_txn_abort(txn); - return {SUCCESS, ""}; - } - if(rc != MDBX_SUCCESS) { - mdbx_txn_abort(txn); - return {100, "Failed to read numeric forward value for remove: " - + std::string(mdbx_strerror(rc))}; - } - if(fwd_val.iov_len != sizeof(uint32_t)) { - mdbx_txn_abort(txn); - return {200, "Corrupt numeric forward value for field '" + field + "'"}; - } - - uint32_t old_val; - std::memcpy(&old_val, fwd_val.iov_base, sizeof(uint32_t)); - auto remove_result = remove_from_buckets(txn, field, old_val, id); - if(!remove_result.ok()) { - mdbx_txn_abort(txn); - return remove_result; - } - - rc = mdbx_del(txn, forward_dbi_, &fwd_key, nullptr); - if(rc != MDBX_SUCCESS && rc != MDBX_NOTFOUND) { - mdbx_txn_abort(txn); - return {100, "Failed to delete numeric forward value: " - + std::string(mdbx_strerror(rc))}; - } - - rc = mdbx_txn_commit(txn); - if(rc != MDBX_SUCCESS) { - return {100, "Failed to commit numeric remove transaction: " - + std::string(mdbx_strerror(rc))}; - } - return {SUCCESS, ""}; - } + ndd::OperationResult<> remove(const std::string& field, ndd::idInt id); /* * Computes a bitmap of ids whose numeric field value falls within an inclusive sortable range. 
@@ -909,141 +234,7 @@ namespace ndd { * 200 = corrupt numeric bucket payload; caller should log ERROR and return HTTP 500 */ ndd::OperationResult - range(const std::string& field, uint32_t min_val, uint32_t max_val) { - ndd::RoaringBitmap result; - MDBX_txn* txn = nullptr; - int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn); - if(rc != MDBX_SUCCESS) { - return {100, "Failed to begin numeric range transaction: " - + std::string(mdbx_strerror(rc))}; - } - - MDBX_cursor* cursor = nullptr; - rc = mdbx_cursor_open(txn, inverted_dbi_, &cursor); - if(rc != MDBX_SUCCESS) { - mdbx_txn_abort(txn); - return {100, "Failed to open numeric range cursor: " - + std::string(mdbx_strerror(rc))}; - } - - // 1. Find Start Bucket - std::string start_k = make_bucket_key(field, min_val); - MDBX_val key{const_cast(start_k.data()), start_k.size()}; - MDBX_val data; - - rc = mdbx_cursor_get(cursor, &key, &data, MDBX_SET_RANGE); - if(rc == MDBX_SUCCESS) { - std::string fkey(static_cast(key.iov_base), key.iov_len); - if(fkey.rfind(field + ":", 0) != 0 || parse_bucket_key_val(fkey) > min_val) { - // Check if we need to back up - MDBX_val prev_key = key; - MDBX_val prev_data; - // Check prev - int prev_rc = mdbx_cursor_get(cursor, &prev_key, &prev_data, MDBX_PREV); - if(prev_rc == MDBX_SUCCESS) { - std::string prev_key_str(static_cast(prev_key.iov_base), - prev_key.iov_len); - if(prev_key_str.rfind(field + ":", 0) == 0) { - // Prev is valid start - key = prev_key; - data = prev_data; - } - } else if(prev_rc != MDBX_NOTFOUND) { - mdbx_cursor_close(cursor); - mdbx_txn_abort(txn); - return {100, "Failed to seek previous numeric range bucket: " - + std::string(mdbx_strerror(prev_rc))}; - } - } - } else if(rc == MDBX_NOTFOUND) { - rc = mdbx_cursor_get(cursor, &key, &data, MDBX_LAST); - if(rc == MDBX_SUCCESS) { - std::string fkey(static_cast(key.iov_base), key.iov_len); - if(fkey.rfind(field + ":", 0) != 0) { - rc = MDBX_NOTFOUND; - } - } else if(rc != MDBX_NOTFOUND) { - 
mdbx_cursor_close(cursor); - mdbx_txn_abort(txn); - return {100, "Failed to seek last numeric range bucket: " - + std::string(mdbx_strerror(rc))}; - } - } else { - mdbx_cursor_close(cursor); - mdbx_txn_abort(txn); - return {100, "Failed to seek numeric range bucket: " - + std::string(mdbx_strerror(rc))}; - } - - try { - // Iterate forward - while(rc == MDBX_SUCCESS) { - std::string cur_key(static_cast(key.iov_base), key.iov_len); - if(cur_key.rfind(field + ":", 0) != 0) { - break; - } - - uint32_t bucket_base = parse_bucket_key_val(cur_key); - if(bucket_base > max_val) { - break; - } - - /** - * Peek Strategy: - * If bucket_base >= min_val, we know the start is covered. - * If we could know NEXT bucket start, we'd know overlap. - * Since we iterate, we can be greedy on read. - * - * For now, always deserialize. - * Potential optimization: Read only bitmap if we are "deep" in the range. - * e.g. min_val=10, max_val=100. Bucket=20. - * If bucket=20. Next Bucket=30. - * Then Bucket 20 covers [20..30). - * Range [10..100] covers [20..30] fully. - * So we need lookahead. - * - * Simple logic without lookahead: - * Just read full bucket. It's 8KB max (2 pages). - * It's fast unless we have millions of buckets. 
- */ - Bucket bucket = Bucket::deserialize(data.iov_base, - data.iov_len, - bucket_base); - if(!bucket.ids.empty()) { - uint32_t bucket_min = bucket.get_value(0); - uint32_t bucket_max = bucket.get_value(bucket.ids.size() - 1); - - if(bucket_min >= min_val && bucket_max <= max_val) { - // Full overlap - result |= bucket.summary_bitmap; - } else { - // Partial overlap - for(size_t i = 0; i < bucket.ids.size(); ++i) { - uint32_t value = bucket.get_value(i); - if(value >= min_val && value <= max_val) { - result.add(bucket.ids[i]); - } - } - } - } - - rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT); - } - } catch(const std::exception& e) { - mdbx_cursor_close(cursor); - mdbx_txn_abort(txn); - return {200, "Corrupt numeric bucket during range scan: " - + std::string(e.what())}; - } - - mdbx_cursor_close(cursor); - mdbx_txn_abort(txn); - if(rc != MDBX_SUCCESS && rc != MDBX_NOTFOUND) { - return {100, "Failed during numeric range scan: " - + std::string(mdbx_strerror(rc))}; - } - return {SUCCESS, "", std::move(result)}; - } + range(const std::string& field, uint32_t min_val, uint32_t max_val); /* * Checks whether one id has a numeric field value inside an inclusive sortable range. 
@@ -1057,38 +248,7 @@ namespace ndd { check_range(const std::string& field, ndd::idInt id, uint32_t min_val, - uint32_t max_val) { - MDBX_txn* txn = nullptr; - int rc = mdbx_txn_begin(env_, nullptr, MDBX_TXN_RDONLY, &txn); - if(rc != MDBX_SUCCESS) { - return {100, "Failed to begin numeric check transaction: " - + std::string(mdbx_strerror(rc))}; - } - - std::string fwd_key_str = make_forward_key(field, id); - MDBX_val fwd_key{const_cast(fwd_key_str.data()), fwd_key_str.size()}; - MDBX_val fwd_val; - - rc = mdbx_get(txn, forward_dbi_, &fwd_key, &fwd_val); - if(rc == MDBX_NOTFOUND) { - mdbx_txn_abort(txn); - return {SUCCESS, "", false}; - } - if(rc != MDBX_SUCCESS) { - mdbx_txn_abort(txn); - return {100, "Failed to read numeric forward value during check: " - + std::string(mdbx_strerror(rc))}; - } - if(fwd_val.iov_len != sizeof(uint32_t)) { - mdbx_txn_abort(txn); - return {200, "Corrupt numeric forward value for field '" + field + "'"}; - } - - uint32_t value; - std::memcpy(&value, fwd_val.iov_base, sizeof(uint32_t)); - mdbx_txn_abort(txn); - return {SUCCESS, "", value >= min_val && value <= max_val}; - } + uint32_t max_val); }; } // namespace filter diff --git a/src/utils/settings.hpp b/src/utils/settings.hpp index 9949e9109e..07210e7bc9 100644 --- a/src/utils/settings.hpp +++ b/src/utils/settings.hpp @@ -5,6 +5,7 @@ #include #include #include +#include constexpr uint64_t KB = (1024ULL); constexpr uint64_t MB = (1024ULL * KB); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2d32d9327c..e583c8ef64 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -14,7 +14,12 @@ file(GLOB LMDB_SOURCES ${CMAKE_SOURCE_DIR}/third_party/mdbx/*.c) set(ROARING_SOURCE ${CMAKE_SOURCE_DIR}/third_party/roaring_bitmap/roaring.c) # Create the test executable -add_executable(ndd_filter_test filter_test.cpp ${LMDB_SOURCES} ${ROARING_SOURCE}) +add_executable(ndd_filter_test + filter_test.cpp + ${NDD_FILTER_SOURCES} + ${LMDB_SOURCES} + ${ROARING_SOURCE} +) # Link against 
GTest target_link_libraries(ndd_filter_test GTest::gtest_main)