Skip to content

Commit 1445b0e

Browse files
committed
feat: Add vector search support to DataEvolutionBatchScan and rename
topk to vector search
1 parent 39fea58 commit 1445b0e

30 files changed

Lines changed: 706 additions & 323 deletions

include/paimon/global_index/bitmap_topk_global_index_result.h renamed to include/paimon/global_index/bitmap_vector_search_global_index_result.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,23 +26,23 @@
2626
#include "paimon/visibility.h"
2727

2828
namespace paimon {
29-
/// Represents a Top-K global index result that combines a Roaring bitmap of candidate row ids
29+
/// Represents a vector search global index result that combines a Roaring bitmap of candidate row ids
3030
/// with an array of associated relevance scores.
3131
///
32-
/// **Important Ordering Note**: Despite inheriting from TopKGlobalIndexResult, the results are
32+
/// **Important Ordering Note**: Despite inheriting from VectorSearchGlobalIndexResult, the results are
3333
/// **NOT sorted by score**. Instead, both the bitmap and the score vector are ordered by
3434
/// **ascending row id**. This design enables efficient merging and set operations while preserving
3535
/// row id-to-score mapping.
36-
class PAIMON_EXPORT BitmapTopKGlobalIndexResult : public TopKGlobalIndexResult {
36+
class PAIMON_EXPORT BitmapVectorSearchGlobalIndexResult : public VectorSearchGlobalIndexResult {
3737
public:
38-
BitmapTopKGlobalIndexResult(RoaringBitmap64&& bitmap, std::vector<float>&& scores)
38+
BitmapVectorSearchGlobalIndexResult(RoaringBitmap64&& bitmap, std::vector<float>&& scores)
3939
: bitmap_(std::move(bitmap)), scores_(std::move(scores)) {
4040
assert(static_cast<size_t>(bitmap_.Cardinality()) == scores_.size());
4141
}
4242

43-
class TopKIterator : public TopKGlobalIndexResult::TopKIterator {
43+
class VectorSearchIterator : public VectorSearchGlobalIndexResult::VectorSearchIterator {
4444
public:
45-
TopKIterator(const RoaringBitmap64* bitmap, RoaringBitmap64::Iterator&& iter,
45+
VectorSearchIterator(const RoaringBitmap64* bitmap, RoaringBitmap64::Iterator&& iter,
4646
const float* scores)
4747
: bitmap_(bitmap), iter_(std::move(iter)), scores_(scores) {}
4848

@@ -65,7 +65,7 @@ class PAIMON_EXPORT BitmapTopKGlobalIndexResult : public TopKGlobalIndexResult {
6565

6666
Result<std::unique_ptr<GlobalIndexResult::Iterator>> CreateIterator() const override;
6767

68-
Result<std::unique_ptr<TopKGlobalIndexResult::TopKIterator>> CreateTopKIterator()
68+
Result<std::unique_ptr<VectorSearchGlobalIndexResult::VectorSearchIterator>> CreateVectorSearchIterator()
6969
const override;
7070

7171
Result<std::shared_ptr<GlobalIndexResult>> And(

include/paimon/global_index/global_index_reader.h

Lines changed: 6 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
#include "paimon/global_index/global_index_result.h"
2424
#include "paimon/predicate/function_visitor.h"
25+
#include "paimon/predicate/vector_search.h"
2526
#include "paimon/visibility.h"
2627

2728
namespace paimon {
@@ -35,37 +36,12 @@ namespace paimon {
3536
/// ids** that start from 0 — not global row ids in the entire table.
3637
/// The `GlobalIndexResult` can be converted to global row ids by calling `AddOffset()`.
3738
class PAIMON_EXPORT GlobalIndexReader : public FunctionVisitor<std::shared_ptr<GlobalIndexResult>> {
38-
public:
39-
/// TopKPreFilter: A lightweight pre-filtering function applied **before** similarity scoring.
40-
/// It operates solely on **local row ids** and is typically driven by other global index, such
41-
/// as bitmap, or range index. This filter enables early pruning of irrelevant candidates (e.g.,
42-
/// "only consider rows with label X"), significantly reducing the search space. Returns true to
43-
/// include the row in Top-K computation; false to exclude it.
44-
///
45-
/// @note Must be thread-safe.
46-
using TopKPreFilter = std::function<bool(int64_t)>;
47-
48-
/// VisitTopK performs approximate top-k similarity search.
49-
///
50-
/// @param k Number of top results to return.
51-
/// @param query The query vector (must match the dimensionality of the indexed vectors).
52-
/// @param filter A pre-filter based on **local row ids**, implemented by leveraging other
53-
/// global index
54-
/// structures (e.g., bitmap index) for efficient candidate pruning.
55-
/// @param predicate A runtime filtering condition that may involve graph traversal of
56-
/// structured attributes. **Using this parameter often yields better
57-
/// filtering accuracy** because during index construction, the underlying
58-
/// graph was built with explicit consideration of field connectivity (e.g.,
59-
/// relationships between attributes). As a result, predicates can leverage
60-
/// this pre-established semantic structure to perform more meaningful and
61-
/// context-aware filtering at query time.
62-
/// @note All fields referenced in the predicate must have been materialized
63-
/// in the index during build to ensure availability.
64-
/// @note `VisitTopK` is thread-safe (not coroutine-safe) while other `VisitXXX` is not
39+
public:
40+
/// VisitVectorSearch performs approximate vector similarity search.
41+
/// @note `VisitVectorSearch` is thread-safe (not coroutine-safe), while the other `VisitXXX` methods are not
6542
/// thread-safe.
66-
virtual Result<std::shared_ptr<TopKGlobalIndexResult>> VisitTopK(
67-
int32_t k, const std::vector<float>& query, TopKPreFilter filter,
68-
const std::shared_ptr<Predicate>& predicate) = 0;
43+
virtual Result<std::shared_ptr<VectorSearchGlobalIndexResult>> VisitVectorSearch(
44+
const std::shared_ptr<VectorSearch>& vector_search) = 0;
6945
};
7046

7147
} // namespace paimon

include/paimon/global_index/global_index_result.h

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ class PAIMON_EXPORT GlobalIndexResult : public std::enable_shared_from_this<Glob
7676
/// Serializes a GlobalIndexResult object into a byte array.
7777
///
7878
/// @note This method only supports the following concrete implementations:
79-
/// - BitmapTopKGlobalIndexResult
79+
/// - BitmapVectorSearchGlobalIndexResult
8080
/// - BitmapGlobalIndexResult
8181
///
8282
/// @param global_index_result The GlobalIndexResult instance to serialize (must not be null).
@@ -91,7 +91,7 @@ class PAIMON_EXPORT GlobalIndexResult : public std::enable_shared_from_this<Glob
9191
///
9292
/// @note The concrete type of the deserialized object is determined by metadata
9393
/// embedded in the buffer. Currently, only the following types are supported:
94-
/// - BitmapTopKGlobalIndexResult
94+
/// - BitmapVectorSearchGlobalIndexResult
9595
/// - BitmapGlobalIndexResult
9696
///
9797
/// @param buffer Pointer to the serialized byte data (must not be null).
@@ -106,18 +106,18 @@ class PAIMON_EXPORT GlobalIndexResult : public std::enable_shared_from_this<Glob
106106
static constexpr int32_t VERSION = 1;
107107
};
108108

109-
/// Represents the result of a Top-K query against a global index.
110-
/// This class encapsulates a set of top-K candidates (row id + score pairs) and provides
109+
/// Represents the result of a vector search query against a global index.
110+
/// This class encapsulates a set of search candidates (row id + score pairs) and provides
111111
/// an iterator interface to traverse them.
112-
class PAIMON_EXPORT TopKGlobalIndexResult : public GlobalIndexResult {
112+
class PAIMON_EXPORT VectorSearchGlobalIndexResult : public GlobalIndexResult {
113113
public:
114-
/// An iterator over the top-K results, returning (row_id, score) pairs.
114+
/// An iterator over the vector search results, returning (row_id, score) pairs.
115115
///
116116
/// @note The results are **NOT sorted by score**. Instead, they are returned in **ascending
117117
/// order of row_id**.
118-
class TopKIterator {
118+
class VectorSearchIterator {
119119
public:
120-
virtual ~TopKIterator() = default;
120+
virtual ~VectorSearchIterator() = default;
121121

122122
/// Checks whether more row ids are available.
123123
virtual bool HasNext() const = 0;
@@ -132,7 +132,7 @@ class PAIMON_EXPORT TopKGlobalIndexResult : public GlobalIndexResult {
132132
virtual std::pair<int64_t, float> NextWithScore() = 0;
133133
};
134134

135-
/// Creates a new iterator for traversing the Top-K results.
136-
virtual Result<std::unique_ptr<TopKIterator>> CreateTopKIterator() const = 0;
135+
/// Creates a new iterator for traversing the vector search results.
136+
virtual Result<std::unique_ptr<VectorSearchIterator>> CreateVectorSearchIterator() const = 0;
137137
};
138138
} // namespace paimon

include/paimon/global_index/row_range_global_index_scanner.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,7 @@ class PAIMON_EXPORT RowRangeGlobalIndexScanner {
5252
/// - Successful with several readers if the indexes exist and load correctly;
5353
/// - Successful with an empty vector if no index was built for the given field;
5454
/// - Error returns when loading fails (e.g., file corruption, I/O error, unsupported
55-
/// format) or the predicate method was incorrectly invoked (e.g., VisitTopK was invoked
56-
/// incorrectly).
55+
/// format).
5756
virtual Result<std::vector<std::shared_ptr<GlobalIndexReader>>> CreateReaders(
5857
const std::string& field_name) const = 0;
5958
};
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
/*
2+
* Copyright 2026-present Alibaba Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#pragma once
18+
#include <memory>
19+
#include <string>
20+
21+
#include "paimon/predicate/predicate.h"
22+
#include "paimon/visibility.h"
23+
24+
namespace paimon {
25+
/// `VectorSearch` bundles the parameters of a vector similarity search.
26+
struct PAIMON_EXPORT VectorSearch {
// NOTE(review): this struct uses std::function, std::vector and int64_t, but the includes visible
// in this header are only <memory>, <string> and project headers — confirm that <functional>,
// <vector> and <cstdint> are not being picked up only transitively.
27+
/// `PreFilter`: A lightweight pre-filtering function applied **before** similarity
28+
/// scoring. It operates solely on **local row ids** and is typically driven by another global
29+
/// index, such as a bitmap or range index. This filter enables early pruning of irrelevant
30+
/// candidates (e.g., "only consider rows with label X"), significantly reducing the search
31+
/// space. Returns true to include the row in the vector search; false to exclude it.
32+
///
33+
/// @note Must be thread-safe.
34+
using PreFilter = std::function<bool(int64_t)>;
35+
36+
/// Constructs a `VectorSearch` by copying each argument into the member of the same name.
VectorSearch(const std::string& _field_name, int32_t _limit, const std::vector<float>& _query,
37+
PreFilter _pre_filter, const std::shared_ptr<Predicate>& _predicate)
38+
: field_name(_field_name),
39+
limit(_limit),
40+
query(_query),
41+
pre_filter(_pre_filter),
42+
predicate(_predicate) {}
43+
44+
/// Name of the field to search on.
45+
std::string field_name;
46+
/// Number of top results to return.
47+
int32_t limit;
48+
/// The query vector (must match the dimensionality of the indexed vectors).
49+
std::vector<float> query;
50+
/// A pre-filter based on **local row ids**, implemented by leveraging other global index
/// structures (e.g., bitmap index) for efficient candidate pruning.
51+
std::function<bool(int64_t)> pre_filter;
52+
/// A runtime filtering condition that may involve graph traversal of
53+
/// structured attributes. **Using this parameter often yields better
54+
/// filtering accuracy** because during index construction, the underlying
55+
/// graph was built with explicit consideration of field connectivity (e.g.,
56+
/// relationships between attributes). As a result, predicates can leverage
57+
/// this pre-established semantic structure to perform more meaningful and
58+
/// context-aware filtering at query time.
59+
/// @note All fields referenced in the predicate must have been materialized
60+
/// in the index during build to ensure availability.
61+
std::shared_ptr<Predicate> predicate;
62+
};
63+
} // namespace paimon

include/paimon/scan_context.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525

2626
#include "paimon/global_index/global_index_result.h"
2727
#include "paimon/predicate/predicate.h"
28+
#include "paimon/predicate/vector_search.h"
2829
#include "paimon/result.h"
2930
#include "paimon/type_fwd.h"
3031
#include "paimon/visibility.h"
@@ -97,14 +98,19 @@ class PAIMON_EXPORT ScanFilter {
9798
public:
9899
ScanFilter(const std::shared_ptr<Predicate>& predicate,
99100
const std::vector<std::map<std::string, std::string>>& partition_filters,
100-
const std::optional<int32_t>& bucket_filter)
101+
const std::optional<int32_t>& bucket_filter,
102+
const std::shared_ptr<VectorSearch>& vector_search)
101103
: predicates_(predicate),
104+
vector_search_(vector_search),
102105
bucket_filter_(bucket_filter),
103106
partition_filters_(partition_filters) {}
104107

105108
std::shared_ptr<Predicate> GetPredicate() const {
106109
return predicates_;
107110
}
111+
std::shared_ptr<VectorSearch> GetVectorSearch() const {
112+
return vector_search_;
113+
}
108114
std::optional<int32_t> GetBucketFilter() const {
109115
return bucket_filter_;
110116
}
@@ -114,6 +120,7 @@ class PAIMON_EXPORT ScanFilter {
114120

115121
private:
116122
std::shared_ptr<Predicate> predicates_;
123+
std::shared_ptr<VectorSearch> vector_search_;
117124
std::optional<int32_t> bucket_filter_;
118125
std::vector<std::map<std::string, std::string>> partition_filters_;
119126
};
@@ -141,6 +148,9 @@ class PAIMON_EXPORT ScanContextBuilder {
141148
/// data retrieval.
142149
ScanContextBuilder& SetGlobalIndexResult(
143150
const std::shared_ptr<GlobalIndexResult>& global_index_result);
151+
152+
/// Set vector search for similarity search.
153+
ScanContextBuilder& SetVectorSearch(const std::shared_ptr<VectorSearch>& vector_search);
144154
/// The options added or set in `ScanContextBuilder` have high priority and will be merged with
145155
/// the options in table schema.
146156
ScanContextBuilder& AddOption(const std::string& key, const std::string& value);

src/paimon/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ set(PAIMON_COMMON_SRCS
4747
common/fs/resolving_file_system.cpp
4848
common/fs/file_system_factory.cpp
4949
common/global_index/complete_index_score_batch_reader.cpp
50-
common/global_index/bitmap_topk_global_index_result.cpp
50+
common/global_index/bitmap_vector_search_global_index_result.cpp
5151
common/global_index/bitmap_global_index_result.cpp
5252
common/global_index/global_index_result.cpp
5353
common/global_index/global_indexer_factory.cpp
@@ -333,7 +333,7 @@ if(PAIMON_BUILD_TESTS)
333333
common/global_index/global_index_result_test.cpp
334334
common/global_index/global_indexer_factory_test.cpp
335335
common/global_index/bitmap_global_index_result_test.cpp
336-
common/global_index/bitmap_topk_global_index_result_test.cpp
336+
common/global_index/bitmap_vector_search_global_index_result_test.cpp
337337
common/global_index/bitmap/bitmap_global_index_test.cpp
338338
common/io/byte_array_input_stream_test.cpp
339339
common/io/data_input_output_stream_test.cpp

src/paimon/common/global_index/bitmap/bitmap_global_index_test.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -216,9 +216,10 @@ TEST_F(BitmapGlobalIndexTest, TestStringType) {
216216
// result
217217
CheckResult(reader->VisitGreaterThan(lit_c).value(), {0, 1, 2, 3, 4});
218218

219-
// test visit topk
220-
ASSERT_NOK_WITH_MSG(reader->VisitTopK(10, {1.0f, 2.0f}, nullptr, nullptr),
221-
"FileIndexReaderWrapper is not supposed to handle topk query");
219+
// test visit vector search
220+
ASSERT_NOK_WITH_MSG(reader->VisitVectorSearch(std::make_shared<VectorSearch>(
221+
"f0", 10, std::vector<float>({1.0f, 2.0f}), nullptr, nullptr)),
222+
"FileIndexReaderWrapper is not supposed to handle vector search query");
222223
};
223224

224225
{

0 commit comments

Comments
 (0)